This commit is contained in:
Andras Schmelczer 2026-05-31 20:20:41 +01:00
parent 8688b7475e
commit e8345cbdc1
40 changed files with 1980 additions and 904 deletions

View file

@ -64,8 +64,6 @@ PBF := $(DATA_DIR)/england-latest.osm.pbf
FR_TOW := $(DATA_DIR)/FR_TOW_V1_ALL.zip
NFI := $(DATA_DIR)/NFI_WOODLAND_ENGLAND.zip
TREE_DENSITY_PC := $(DATA_DIR)/tree_density_by_postcode.parquet
TREE_DENSITY_STREETS := $(DATA_DIR)/tree_density_by_street.parquet
TREE_DENSITY_ADDR := $(DATA_DIR)/tree_density_by_address.parquet
OFS_REGISTER := $(DATA_DIR)/ofs_register.xlsx
PLACES := $(DATA_DIR)/places.parquet
MEDIAN_AGE := $(DATA_DIR)/median_age.parquet
@ -183,6 +181,7 @@ $(PC_BOUNDARIES_STAMP): $(OA_BOUNDARIES) $(INSPIRE_STAMP) $(UPRN_LOOKUP) $(ARCGI
--oa-boundaries $(OA_BOUNDARIES) \
--inspire $(INSPIRE_DIR) \
--output $(PC_BOUNDARIES)
$(VALIDATE_OUTPUTS) --active-postcode-boundary-match "$(ARCGIS)::$(PC_BOUNDARIES)"
@touch $@
generate-travel-times: $(ARCGIS) $(PLACES) $(PBF) download-transit-network
@if [ -f "$(R5_NETWORK_CACHE)" ] && { [ "$(PBF)" -nt "$(R5_NETWORK_CACHE)" ] || [ "$(TRANSIT_STAMP)" -nt "$(R5_NETWORK_CACHE)" ]; }; then \
@ -358,7 +357,7 @@ $(POIS_FILTERED): $(POIS_RAW) $(NAPTAN) $(GROCERY_RETAIL_POINTS) $(GIAS) $(OFSTE
$(EPC_PP): $(PRICE_PAID) $(EPC) pipeline/transform/join_epc_pp.py pipeline/utils/fuzzy_join.py
uv run python -m pipeline.transform.join_epc_pp --epc $(EPC) --price-paid $(PRICE_PAID) --output $@
$(CRIME) $(CRIME_BY_YEAR) &: $(CRIME_STAMP) $(PC_BOUNDARIES) pipeline/transform/crime_spatial.py pipeline/transform/postcode_boundaries/loader.py pipeline/transform/crime.py
$(CRIME) $(CRIME_BY_YEAR) &: $(CRIME_STAMP) $(PC_BOUNDARIES_STAMP) pipeline/transform/crime_spatial.py pipeline/transform/postcode_boundaries/loader.py pipeline/transform/crime.py
$(VALIDATE_OUTPUTS) --file $(CRIME_DIR)/archive_manifest.json --glob "$(CRIME_DIR)::**/*-street.csv"
uv run python -m pipeline.transform.crime_spatial --input $(CRIME_DIR) --boundaries $(PC_BOUNDARIES)/units --output $(CRIME) --output-by-year $(CRIME_BY_YEAR)
@ -368,15 +367,12 @@ $(POI_PROXIMITY): $(ARCGIS) $(POIS_FILTERED) $(OS_GREENSPACE) $(POI_PROXIMITY_DE
$(SCHOOL_PROX): $(OFSTED) $(ARCGIS) pipeline/transform/school_proximity.py pipeline/utils/poi_counts.py
uv run python -m pipeline.transform.school_proximity --ofsted $(OFSTED) --arcgis $(ARCGIS) --output $@
$(TREE_DENSITY_PC): $(FR_TOW) $(NFI) $(ARCGIS) $(PRICE_PAID) $(TREE_DENSITY_DEPS)
$(TREE_DENSITY_PC): $(FR_TOW) $(NFI) $(ARCGIS) $(TREE_DENSITY_DEPS)
uv run python -m pipeline.transform.tree_density \
--tow-zip $(FR_TOW) \
--nfi-zip $(NFI) \
--arcgis $(ARCGIS) \
--price-paid $(PRICE_PAID) \
--output-postcodes $(TREE_DENSITY_PC) \
--output-streets $(TREE_DENSITY_STREETS) \
--output-addresses $(TREE_DENSITY_ADDR)
--output-postcodes $(TREE_DENSITY_PC)
# Postcode boundaries require manual generation — fail with instructions
$(PC_BOUNDARIES):

View file

@ -81,6 +81,15 @@ function isProtectedPage(page: Page): boolean {
return page === 'account' || page === 'saved';
}
/**
 * Reports whether the current browser URL carries a well-formed `share`
 * query parameter: present, non-empty, and 1-20 ASCII letters or digits.
 */
function isSharedDashboardUrl(): boolean {
  const query = new URLSearchParams(window.location.search);
  const shareCode = query.get('share');
  // A missing or empty value can never be a valid share code.
  if (!shareCode) return false;
  return /^[a-z0-9]{1,20}$/i.test(shareCode);
}
/**
 * Reports whether `page` may only be shown to a signed-in user.
 * Protected pages always require auth; the dashboard requires it unless the
 * current URL is a shared-dashboard link.
 */
function isAuthRequiredRoute(page: Page): boolean {
  if (isProtectedPage(page)) return true;
  if (page !== 'dashboard') return false;
  // Only the dashboard reaches this point: shared links are exempt.
  return !isSharedDashboardUrl();
}
function buildPageUrl(page: Page, inviteCode?: string, search = '', hash = ''): string {
const normalizedHash = normalizeHash(hash);
return `${pageToPath(page, inviteCode)}${search}${normalizedHash ? `#${normalizedHash}` : ''}`;
@ -235,6 +244,7 @@ export default function App() {
const postAuthCheckoutReturnPathRef = useRef<string | null>(null);
const authCompletedRef = useRef(false);
const [licenseSuccessStatus, setLicenseSuccessStatus] = useState<LicenseSuccessStatus>('hidden');
const [dashboardReady, setDashboardReady] = useState(false);
// Keep a ref to the latest refreshAuth so the mount-only startup effect always
// calls the current implementation without re-running when the callback identity changes.
@ -266,7 +276,7 @@ export default function App() {
if (!completed) {
setPostAuthIntent(null);
postAuthCheckoutReturnPathRef.current = null;
if (isProtectedPage(activePageRef.current)) {
if (isAuthRequiredRoute(activePageRef.current)) {
window.history.replaceState({ page: 'home', hash: '' }, '', '/');
setRouteHash('');
setActivePage('home');
@ -517,7 +527,10 @@ export default function App() {
}
}, [activePage, fetchSearches]);
const isAuthRequiredPage = activePage === 'account' || activePage === 'saved';
const isAuthRequiredPage =
activePage === 'account' ||
activePage === 'saved' ||
(activePage === 'dashboard' && !mapUrlState.share);
useEffect(() => {
if (authLoading) return;
if (isAuthRequiredPage && !user) {
@ -530,6 +543,13 @@ export default function App() {
const [exportState, setExportState] = useState<ExportState | null>(null);
// Whenever the dashboard is not the active page, or no user is signed in,
// reset the dashboard-derived state: readiness goes back to false and any
// export state is cleared. Re-evaluated each time activePage or user changes.
useEffect(() => {
if (activePage !== 'dashboard' || !user) {
setDashboardReady(false);
setExportState(null);
}
}, [activePage, user]);
if ((isScreenshotMode || isOgMode) && inviteCode) {
return (
<Suspense fallback={<PageFallback />}>
@ -584,8 +604,9 @@ export default function App() {
onPageChange={navigateTo}
theme={theme}
onToggleTheme={toggleTheme}
exportState={activePage === 'dashboard' ? exportState : null}
exportState={activePage === 'dashboard' && user ? exportState : null}
dashboardParams={activePage === 'dashboard' ? dashboardParams : ''}
dashboardActionsDisabled={activePage === 'dashboard' && !dashboardReady}
onSaveSearch={
activePage === 'dashboard' && user
? editingSearch
@ -675,6 +696,7 @@ export default function App() {
onNavigateTo={navigateTo}
onExportStateChange={setExportState}
onDashboardParamsChange={setDashboardParams}
onDashboardReadyChange={setDashboardReady}
isMobile={isMobile}
initialTravelTime={mapUrlState.travelTime}
initialPostcode={mapUrlState.postcode}

View file

@ -461,6 +461,24 @@ interface ShareLinkListItem {
created: string;
}
/**
 * Picks, for each invite type, the URL of the most recently created invite
 * that is still pending (not used) and actually has a URL.
 *
 * @param invites Invite records to scan; order only matters for tie-breaking.
 * @returns An object mapping each invite type to the URL of its newest
 *          pending invite. Types with no pending invite are absent.
 */
function latestPendingInviteUrls(invites: InviteListItem[]): Record<string, string> {
  // A Map is used instead of a plain object so that type names which collide
  // with Object.prototype members ("constructor", "toString", "__proto__", ...)
  // are tracked like any other key instead of being mistaken for an existing
  // entry and silently dropped.
  const latestByType = new Map<string, { url: string; createdMs: number }>();

  for (const invite of invites) {
    if (invite.used || !invite.url) continue;

    // Date.parse yields NaN for unparseable input; fall back to 0 so such
    // invites sort as the oldest possible.
    const createdMs = Date.parse(invite.created) || 0;
    const existing = latestByType.get(invite.invite_type);

    // Strict ">" keeps the earlier list entry when two timestamps tie.
    if (!existing || createdMs > existing.createdMs) {
      latestByType.set(invite.invite_type, { url: invite.url, createdMs });
    }
  }

  // Object.fromEntries defines own data properties, so even "__proto__" ends
  // up as a regular key on the result.
  return Object.fromEntries(
    Array.from(latestByType, ([type, invite]) => [type, invite.url])
  );
}
function InviteTable({
invites,
loading,
@ -673,7 +691,16 @@ function InviteSection({ user }: { user: AuthUser }) {
const res = await fetch(apiUrl('invites'), authHeaders());
assertOk(res, 'Fetch invites');
const data = await res.json();
setInviteHistory(data.invites);
const invites: InviteListItem[] = Array.isArray(data.invites) ? data.invites : [];
setInviteHistory(invites);
const pendingInviteUrls = latestPendingInviteUrls(invites);
setInviteUrl((prev) => {
const next = { ...prev };
for (const [type, url] of Object.entries(pendingInviteUrls)) {
if (!next[type]) next[type] = url;
}
return next;
});
} catch {
// Silent — non-critical
} finally {

View file

@ -8,8 +8,11 @@ const RECENT_SEARCHES_STORAGE_KEY = 'perfect-postcode.locationSearch.recent';
vi.mock('react-i18next', () => ({
useTranslation: () => ({
t: (key: string) =>
key === 'locationSearch.placeholder' ? 'Search places or postcodes...' : key,
t: (key: string) => {
if (key === 'locationSearch.placeholder') return 'Search places or postcodes...';
if (key === 'locationSearch.noResults') return 'No matching places or postcodes';
return key;
},
}),
}));
@ -226,6 +229,91 @@ describe('LocationSearch', () => {
);
});
it('selects the first place suggestion with Enter when none is highlighted', async () => {
  // Route the stubbed fetch by pathname: one place suggestion for the search,
  // a nearest-postcode lookup for the selected place, 404 for anything else.
  const fetchMock = vi.fn((input: string | URL | Request) => {
    const { pathname } = new URL(String(input), 'http://localhost');
    switch (pathname) {
      case '/api/places':
        return Promise.resolve(
          jsonResponse({
            places: [
              {
                type: 'place',
                name: 'London',
                slug: 'london',
                place_type: 'city',
                lat: 51.5074,
                lon: -0.1278,
              },
            ],
            postcodes: [],
            addresses: [],
          })
        );
      case '/api/nearest-postcode':
        return Promise.resolve(
          jsonResponse({
            postcode: 'SW1A 1AA',
            latitude: 51.501,
            longitude: -0.141,
            geometry: postcodeGeometry,
          })
        );
      default:
        return Promise.resolve(new Response(null, { status: 404 }));
    }
  });
  vi.stubGlobal('fetch', fetchMock);

  const onFlyTo = vi.fn();
  const onLocationSearched = vi.fn();
  render(<LocationSearch onFlyTo={onFlyTo} onLocationSearched={onLocationSearched} />);

  // Type a query, wait for the suggestion to render, then press Enter without
  // highlighting anything first.
  const searchBox = screen.getByRole('textbox');
  fireEvent.change(searchBox, { target: { value: 'London' } });
  await screen.findByRole('button', { name: 'London' });
  fireEvent.keyDown(searchBox, { key: 'Enter' });

  await waitFor(() => {
    expect(onLocationSearched).toHaveBeenCalledTimes(1);
  });

  // The map flies to the place itself; the callback receives the nearest
  // postcode plus the place coordinates as the marker position.
  expect(onFlyTo).toHaveBeenCalledWith(51.5074, -0.1278, 10);
  expect(onLocationSearched).toHaveBeenCalledWith({
    postcode: 'SW1A 1AA',
    geometry: postcodeGeometry,
    latitude: 51.501,
    longitude: -0.141,
    zoom: 10,
    markerLatitude: 51.5074,
    markerLongitude: -0.1278,
  });
});
it('shows an empty state for invalid place queries', async () => {
  // Every request resolves with a response containing no matches of any kind.
  const emptyFetch = vi.fn(() =>
    Promise.resolve(jsonResponse({ places: [], postcodes: [], addresses: [] }))
  );
  vi.stubGlobal('fetch', emptyFetch);

  render(<LocationSearch onFlyTo={vi.fn()} onLocationSearched={vi.fn()} />);

  const searchBox = screen.getByRole('textbox');
  fireEvent.change(searchBox, { target: { value: '!!!!zzzzzz!!!!' } });

  // The "no results" message should appear once the search has settled.
  await waitFor(() => {
    expect(screen.getByText('No matching places or postcodes')).toBeTruthy();
  });
});
it('keeps only the three most recent local searches', async () => {
vi.stubGlobal(
'fetch',

View file

@ -333,6 +333,8 @@ export default function LocationSearch({
onSelect={selectResult}
loading={loading}
placeholder={t('locationSearch.placeholder')}
ariaLabel={t('locationSearch.searchLabel')}
name="location-search"
size="sm"
inputClassName={
inputClassName ??

View file

@ -91,6 +91,7 @@ export default function MapPage({
onNavigateTo,
onExportStateChange,
onDashboardParamsChange,
onDashboardReadyChange,
screenshotMode,
ogMode,
isMobile = false,
@ -642,6 +643,23 @@ export default function MapPage({
onDashboardParamsChange?.(dashboardParams);
}, [dashboardParams, onDashboardParamsChange]);
// The dashboard counts as ready once the initial load has finished, map data
// is not loading, no license is required, and both the bounds and the
// current view are available.
const dashboardReady =
!initialLoading &&
!mapData.loading &&
!mapData.licenseRequired &&
mapData.bounds != null &&
mapData.currentView != null;
// Report the readiness flag through the optional callback whenever it changes.
useEffect(() => {
onDashboardReadyChange?.(dashboardReady);
}, [dashboardReady, onDashboardReadyChange]);
// Cleanup-only effect: report "not ready" when this effect is torn down,
// i.e. on unmount or when the callback prop itself changes.
useEffect(() => {
return () => {
onDashboardReadyChange?.(false);
};
}, [onDashboardReadyChange]);
useEffect(() => {
if (mapData.licenseRequired) trackEvent('Upgrade Modal Shown');
}, [mapData.licenseRequired]);
@ -830,8 +848,8 @@ export default function MapPage({
</button>
<button
onClick={() => onUpdateEdit?.(dashboardParams)}
disabled={savingSearch}
className="shrink-0 cursor-pointer px-2.5 py-1 rounded text-xs font-medium bg-teal-600 text-white hover:bg-teal-700 disabled:opacity-50 disabled:cursor-wait flex items-center gap-1.5"
disabled={savingSearch || !dashboardReady}
className="shrink-0 cursor-pointer px-2.5 py-1 rounded text-xs font-medium bg-teal-600 text-white hover:bg-teal-700 disabled:opacity-50 disabled:cursor-not-allowed flex items-center gap-1.5"
>
{savingSearch ? t('savedPage.updating') : t('common.update')}
</button>

View file

@ -186,7 +186,7 @@ export default function POIPane({
</div>
{!isCollapsed && (
<div className="px-3 py-2">
<PillGroup>
<PillGroup wrap>
{group.categories.map((category) => {
const logo = getPoiCategoryLogoUrl(category);
return (

View file

@ -269,7 +269,10 @@ export function DesktopMapPage({
</div>
)}
{poiPaneOpen && (
<div className="absolute bottom-28 right-4 z-10 flex h-[60vh] min-h-0 w-80 flex-col overflow-hidden rounded-lg border border-warm-200 bg-white shadow-xl dark:border-warm-700 dark:bg-warm-900">
<div
className="absolute bottom-28 right-4 z-10 flex min-h-0 w-80 max-w-[calc(100%_-_2rem)] flex-col overflow-hidden rounded-lg border border-warm-200 bg-white shadow-xl dark:border-warm-700 dark:bg-warm-900"
style={{ height: 'min(30rem, calc(100vh - 10rem))' }}
>
{poiPane}
</div>
)}

View file

@ -132,6 +132,10 @@ export function MobileMapPage({
upgradeModal,
editingBar,
}: MobileMapPageProps) {
const floatingPaneAvailableHeight = `max(12rem, calc(100dvh - ${Math.ceil(
bottomScreenInset
)}px - 7rem))`;
return (
<div className="flex-1 overflow-hidden relative">
<LoadingOverlay show={initialLoading} />
@ -219,7 +223,13 @@ export function MobileMapPage({
)}
{poiPaneOpen && (
<div className="absolute top-24 right-3 left-3 z-20 flex h-[45dvh] min-h-0 flex-col overflow-hidden rounded-lg border border-warm-200 bg-white shadow-xl dark:border-warm-700 dark:bg-warm-900">
<div
className="absolute top-24 right-3 left-3 z-20 flex min-h-0 flex-col overflow-hidden rounded-lg border border-warm-200 bg-white shadow-xl dark:border-warm-700 dark:bg-warm-900"
style={{
height: `min(22rem, ${floatingPaneAvailableHeight})`,
maxHeight: floatingPaneAvailableHeight,
}}
>
{poiPane}
</div>
)}

View file

@ -39,6 +39,7 @@ export interface MapPageProps {
onNavigateTo: (page: Page, hash?: string, infoFeature?: string) => void;
onExportStateChange?: (state: ExportState) => void;
onDashboardParamsChange?: (params: string) => void;
onDashboardReadyChange?: (ready: boolean) => void;
screenshotMode?: boolean;
ogMode?: boolean;
isMobile?: boolean;

View file

@ -1,4 +1,4 @@
import { useState, useCallback, useEffect } from 'react';
import { useState, useCallback, useEffect, useId } from 'react';
import { useTranslation } from 'react-i18next';
import { CloseIcon } from './icons/CloseIcon';
import { GoogleIcon } from './icons/GoogleIcon';
@ -36,6 +36,9 @@ export default function AuthModal({
const [password, setPassword] = useState('');
const [resetSent, setResetSent] = useState(false);
const dialogRef = useModalA11y();
const fieldId = useId();
const emailInputId = `${fieldId}-email`;
const passwordInputId = `${fieldId}-password`;
useEffect(() => {
trackEvent('Auth Modal Open', { tab: initialTab });
@ -194,14 +197,20 @@ export default function AuthModal({
{/* Email form */}
<form onSubmit={handleSubmit} className="space-y-4">
<div>
<label className="block text-sm font-medium text-warm-700 dark:text-warm-300 mb-1">
<label
htmlFor={emailInputId}
className="block text-sm font-medium text-warm-700 dark:text-warm-300 mb-1"
>
{t('auth.email')}
</label>
<input
id={emailInputId}
name="email"
type="email"
value={email}
onChange={(e) => setEmail(e.target.value)}
required
autoComplete="email"
className="w-full px-3 py-2 text-sm rounded border border-warm-200 dark:border-warm-700 bg-white dark:bg-warm-800 text-navy-950 dark:text-white placeholder-warm-400 dark:placeholder-warm-500 outline-none focus:ring-2 ring-teal-400 dark:ring-teal-500"
placeholder={t('auth.emailPlaceholder')}
/>
@ -209,15 +218,21 @@ export default function AuthModal({
{view !== 'forgot' && (
<div>
<label className="block text-sm font-medium text-warm-700 dark:text-warm-300 mb-1">
<label
htmlFor={passwordInputId}
className="block text-sm font-medium text-warm-700 dark:text-warm-300 mb-1"
>
{t('auth.password')}
</label>
<input
id={passwordInputId}
name="password"
type="password"
value={password}
onChange={(e) => setPassword(e.target.value)}
required
minLength={8}
autoComplete={view === 'register' ? 'new-password' : 'current-password'}
className="w-full px-3 py-2 text-sm rounded border border-warm-200 dark:border-warm-700 bg-white dark:bg-warm-800 text-navy-950 dark:text-white placeholder-warm-400 dark:placeholder-warm-500 outline-none focus:ring-2 ring-teal-400 dark:ring-teal-500"
placeholder={
view === 'register'

View file

@ -78,6 +78,7 @@ export default function Header({
onToggleTheme,
exportState,
dashboardParams,
dashboardActionsDisabled = false,
onSaveSearch,
savingSearch,
editingSearch,
@ -96,6 +97,7 @@ export default function Header({
onToggleTheme: () => void;
exportState: HeaderExportState | null;
dashboardParams: string;
dashboardActionsDisabled?: boolean;
onSaveSearch: (() => void) | null;
savingSearch: boolean;
editingSearch: EditingSearchState | null;
@ -116,6 +118,7 @@ export default function Header({
() => window.matchMedia(DASHBOARD_TABLET_SIDEBAR_QUERY).matches
);
const useSidebarNav = isMobile || (activePage === 'dashboard' && isDashboardTabletSidebarWidth);
const dashboardActionsBlocked = activePage === 'dashboard' && (!user || dashboardActionsDisabled);
useEffect(() => {
const mql = window.matchMedia(DASHBOARD_TABLET_SIDEBAR_QUERY);
@ -139,6 +142,10 @@ export default function Header({
if (!useSidebarNav) setMenuOpen(false);
}, [useSidebarNav]);
useEffect(() => {
if (dashboardActionsBlocked) setExportMenuOpen(false);
}, [dashboardActionsBlocked]);
const doCopy = useCallback((text: string) => {
copyToClipboard(text, () => {
setCopied(true);
@ -147,6 +154,7 @@ export default function Header({
}, []);
const handleShare = useCallback(async () => {
if (dashboardActionsBlocked) return;
const params =
activePage === 'dashboard' ? dashboardParams : window.location.search.replace(/^\?/, '');
if (!params) {
@ -167,7 +175,7 @@ export default function Header({
} finally {
setSharing(false);
}
}, [activePage, dashboardParams, doCopy, i18n.language]);
}, [activePage, dashboardActionsBlocked, dashboardParams, doCopy, i18n.language]);
const navLink = (page: Page, e: React.MouseEvent, hash?: string) => {
if (e.metaKey || e.ctrlKey || e.shiftKey || e.button !== 0) return;
@ -206,8 +214,8 @@ export default function Header({
</button>
<button
onClick={onUpdateEdit}
disabled={savingSearch}
className="cursor-pointer px-3 py-1.5 rounded bg-teal-600 hover:bg-teal-700 transition-colors text-sm font-medium disabled:opacity-50 disabled:cursor-wait flex items-center gap-1.5"
disabled={savingSearch || dashboardActionsBlocked}
className="cursor-pointer px-3 py-1.5 rounded bg-teal-600 hover:bg-teal-700 transition-colors text-sm font-medium disabled:opacity-50 disabled:cursor-not-allowed flex items-center gap-1.5"
>
{savingSearch && <SpinnerIcon className="w-4 h-4 animate-spin" />}
{savingSearch ? t('savedPage.updating') : t('common.update')}
@ -216,14 +224,16 @@ export default function Header({
</div>
)}
{/* Left: Logo + nav */}
<div className="flex items-center gap-4">
<div className="flex min-w-0 items-center gap-4">
<a
href="/"
className="flex cursor-pointer items-center gap-2 hover:opacity-80 transition-opacity"
className="flex min-w-0 cursor-pointer items-center gap-2 hover:opacity-80 transition-opacity"
onClick={(e) => navLink('home', e)}
>
<LogoIcon className="w-5 h-5 text-teal-400" />
<span className="text-lg font-semibold text-teal-300">{t('header.appName')}</span>
<LogoIcon className="w-5 h-5 shrink-0 text-teal-400" />
<span className="max-w-[9rem] truncate whitespace-nowrap text-lg font-semibold text-teal-300 sm:max-w-none">
{t('header.appName')}
</span>
</a>
{/* Desktop nav */}
@ -266,14 +276,14 @@ export default function Header({
</div>
{/* Right side */}
<div className="flex items-center gap-2 ml-auto">
<div className="ml-auto flex shrink-0 items-center gap-2">
{/* Desktop-only dashboard actions */}
{!useSidebarNav && activePage === 'dashboard' && (
{!useSidebarNav && activePage === 'dashboard' && user && (
<>
<button
onClick={handleShare}
disabled={sharing}
className="flex cursor-pointer items-center gap-1.5 px-3 py-1.5 rounded bg-navy-800 hover:bg-navy-700 transition-colors text-sm disabled:opacity-50"
disabled={sharing || dashboardActionsBlocked}
className="flex cursor-pointer items-center gap-1.5 px-3 py-1.5 rounded bg-navy-800 hover:bg-navy-700 transition-colors text-sm disabled:cursor-not-allowed disabled:opacity-50"
>
{sharing ? (
<>
@ -295,8 +305,8 @@ export default function Header({
{exportState && (
<button
onClick={() => setExportMenuOpen(true)}
disabled={exportState.exporting}
className="flex cursor-pointer items-center gap-1.5 px-3 py-1.5 rounded bg-navy-800 hover:bg-navy-700 transition-colors text-sm disabled:opacity-50"
disabled={exportState.exporting || dashboardActionsBlocked}
className="flex cursor-pointer items-center gap-1.5 px-3 py-1.5 rounded bg-navy-800 hover:bg-navy-700 transition-colors text-sm disabled:cursor-not-allowed disabled:opacity-50"
title={t('header.exportToExcel')}
>
<DownloadIcon className="w-4 h-4" />
@ -306,8 +316,8 @@ export default function Header({
{onSaveSearch && !editingSearch && (
<button
onClick={onSaveSearch}
disabled={savingSearch}
className="flex cursor-pointer items-center gap-1.5 px-3 py-1.5 rounded bg-navy-800 hover:bg-navy-700 transition-colors text-sm disabled:opacity-50"
disabled={savingSearch || dashboardActionsBlocked}
className="flex cursor-pointer items-center gap-1.5 px-3 py-1.5 rounded bg-navy-800 hover:bg-navy-700 transition-colors text-sm disabled:cursor-not-allowed disabled:opacity-50"
>
{savingSearch ? (
<SpinnerIcon className="w-4 h-4 animate-spin" />
@ -363,7 +373,7 @@ export default function Header({
{useSidebarNav && !user && (
<button
onClick={onRegisterClick}
className="cursor-pointer px-4 py-1.5 rounded bg-teal-600 hover:bg-teal-700 transition-colors text-sm font-semibold"
className="flex h-8 max-w-[8.5rem] shrink-0 cursor-pointer items-center justify-center truncate whitespace-nowrap rounded bg-teal-600 px-2.5 text-xs font-semibold leading-none transition-colors hover:bg-teal-700 sm:max-w-none sm:px-3 sm:text-sm"
>
{t('header.createAccount')}
</button>
@ -410,6 +420,7 @@ export default function Header({
onToggleTheme={onToggleTheme}
exportState={exportState}
onOpenExportMenu={() => setExportMenuOpen(true)}
dashboardActionsDisabled={dashboardActionsBlocked}
onSaveSearch={onSaveSearch}
savingSearch={savingSearch}
isEditingSearch={!!editingSearch}

View file

@ -20,6 +20,7 @@ interface MobileMenuProps {
onToggleTheme: () => void;
exportState: HeaderExportState | null;
onOpenExportMenu: () => void;
dashboardActionsDisabled?: boolean;
onSaveSearch: (() => void) | null;
savingSearch: boolean;
isEditingSearch: boolean;
@ -41,6 +42,7 @@ export default function MobileMenu({
onToggleTheme,
exportState,
onOpenExportMenu,
dashboardActionsDisabled = false,
onSaveSearch,
savingSearch,
isEditingSearch,
@ -101,7 +103,7 @@ export default function MobileMenu({
</a>
);
const dashboardActions = activePage === 'dashboard' && (
const dashboardActions = activePage === 'dashboard' && user && (
<div className="px-2 py-2 border-b border-navy-700">
<div className="grid grid-cols-2 gap-2">
<button
@ -109,7 +111,7 @@ export default function MobileMenu({
onShare();
onClose();
}}
disabled={sharing}
disabled={sharing || dashboardActionsDisabled}
className={dashboardActionClass}
>
{sharing ? (
@ -127,8 +129,8 @@ export default function MobileMenu({
onClose();
onOpenExportMenu();
}}
disabled={exportState.exporting}
className={dashboardActionClass}
disabled={exportState.exporting || dashboardActionsDisabled}
>
<DownloadIcon className="w-4 h-4" />
{exportState.exporting ? t('header.exporting') : t('header.exportLabel')}
@ -140,7 +142,7 @@ export default function MobileMenu({
onSaveSearch();
onClose();
}}
disabled={savingSearch}
disabled={savingSearch || dashboardActionsDisabled}
className={dashboardActionClass}
>
{savingSearch ? (
@ -199,7 +201,7 @@ export default function MobileMenu({
</button>
{/* Language selector */}
<div className="flex max-w-full gap-1 overflow-x-auto overflow-y-hidden px-3 pb-1 scrollbar-hide">
<div className="grid max-w-full grid-cols-3 gap-1 px-3 pb-1">
{SUPPORTED_LANGUAGES.map((lang) => (
<button
key={lang.code}
@ -208,7 +210,7 @@ export default function MobileMenu({
localStorage.setItem('language', lang.code);
void changeAppLanguage(lang.code);
}}
className={`flex-none min-w-[2.5rem] flex cursor-pointer items-center justify-center gap-1.5 px-2 py-1.5 rounded text-sm ${
className={`flex min-w-0 cursor-pointer items-center justify-center gap-1.5 rounded px-2 py-1.5 text-sm ${
i18n.language === lang.code
? 'bg-navy-700 text-white font-medium'
: 'text-warm-400 hover:bg-navy-800 hover:text-white'

View file

@ -3,12 +3,17 @@ import type { ReactNode } from 'react';
interface PillGroupProps {
children: ReactNode;
className?: string;
wrap?: boolean;
}
export function PillGroup({ children, className = '' }: PillGroupProps) {
export function PillGroup({ children, className = '', wrap = false }: PillGroupProps) {
return (
<div
className={`flex min-w-0 max-w-full flex-nowrap gap-1.5 overflow-x-auto overscroll-x-contain touch-pan-x touch-pan-y scrollbar-hide md:flex-wrap md:overflow-x-visible ${className}`}
className={`flex min-w-0 max-w-full gap-1.5 ${
wrap
? 'flex-wrap overflow-x-visible'
: 'flex-nowrap overflow-x-auto overscroll-x-contain touch-pan-x touch-pan-y scrollbar-hide md:flex-wrap md:overflow-x-visible'
} ${className}`}
>
{children}
</div>

View file

@ -1,5 +1,6 @@
import { useRef } from 'react';
import { createPortal } from 'react-dom';
import { useTranslation } from 'react-i18next';
import type React from 'react';
import type { SearchResult } from '../../hooks/useLocationSearch';
import { useDropdownPosition } from '../../hooks/useDropdownPosition';
@ -13,6 +14,7 @@ interface SearchHook {
activeIndex: number;
setActiveIndex: (idx: number) => void;
open: boolean;
searching?: boolean;
handleInputChange: (value: string) => void;
handleKeyDown: (e: React.KeyboardEvent, onSelect: (result: SearchResult) => void) => void;
showEmptySearches: () => void;
@ -23,6 +25,8 @@ interface PlaceSearchInputProps {
onSelect: (result: SearchResult) => void;
loading?: boolean;
placeholder?: string;
ariaLabel?: string;
name?: string;
size?: 'sm' | 'xs';
inputClassName?: string;
inputRef?: React.Ref<HTMLInputElement>;
@ -35,19 +39,28 @@ export function PlaceSearchInput({
onSelect,
loading,
placeholder,
ariaLabel,
name,
size = 'sm',
inputClassName,
inputRef,
onInputChange,
portal,
}: PlaceSearchInputProps) {
const { t } = useTranslation();
const sm = size === 'sm';
const iconSize = sm ? 'w-4 h-4' : 'w-3 h-3';
const spinnerSize = sm ? 'w-4 h-4' : 'w-3 h-3';
const wrapperRef = useRef<HTMLDivElement>(null);
const dropdownPos = useDropdownPosition(wrapperRef, portal ? search.open : false);
const showDropdown = search.open && search.results.length > 0;
const showEmptyResults =
search.open &&
!search.searching &&
search.query.trim().length >= 2 &&
search.results.length === 0;
const showDropdown = search.open && (search.results.length > 0 || showEmptyResults);
const showSpinner = loading || search.searching;
const dropdown = showDropdown && (
<div
@ -64,57 +77,66 @@ export function PlaceSearchInput({
: undefined
}
>
{search.results.map((result, idx) => (
<button
key={
result.type === 'postcode'
? `pc-${result.label}`
: result.type === 'address'
? `addr-${result.postcode}-${result.address}-${result.lat}`
: `pl-${result.name}-${result.lat}`
}
type="button"
className={`w-full text-left flex items-center cursor-pointer ${
sm ? 'px-3 py-2 gap-2 text-sm' : 'px-2 py-1.5 gap-1.5 text-xs'
} ${
idx === search.activeIndex
? 'bg-teal-50 dark:bg-teal-900/30'
: 'hover:bg-warm-50 dark:hover:bg-warm-700'
}`}
onMouseEnter={() => search.setActiveIndex(idx)}
onMouseDown={(e) => {
e.preventDefault();
onSelect(result);
}}
{showEmptyResults ? (
<div
className={`text-warm-500 dark:text-warm-400 ${sm ? 'px-3 py-2 text-sm' : 'px-2 py-1.5 text-xs'}`}
role="status"
>
{result.type === 'postcode' ? (
<>
<SearchIcon className={`${iconSize} text-warm-400 dark:text-warm-500 shrink-0`} />
<span className="text-warm-700 dark:text-warm-200">{result.label}</span>
</>
) : result.type === 'address' ? (
<>
<HouseIcon className={`${iconSize} text-warm-400 dark:text-warm-500 shrink-0`} />
<span className="min-w-0 text-warm-700 dark:text-warm-200">
<span className="block truncate">{result.address}</span>
<span className="block truncate text-warm-400 dark:text-warm-500">
{result.postcode}
{t('locationSearch.noResults')}
</div>
) : (
search.results.map((result, idx) => (
<button
key={
result.type === 'postcode'
? `pc-${result.label}`
: result.type === 'address'
? `addr-${result.postcode}-${result.address}-${result.lat}`
: `pl-${result.name}-${result.lat}`
}
type="button"
className={`w-full text-left flex items-center cursor-pointer ${
sm ? 'px-3 py-2 gap-2 text-sm' : 'px-2 py-1.5 gap-1.5 text-xs'
} ${
idx === search.activeIndex
? 'bg-teal-50 dark:bg-teal-900/30'
: 'hover:bg-warm-50 dark:hover:bg-warm-700'
}`}
onMouseEnter={() => search.setActiveIndex(idx)}
onMouseDown={(e) => {
e.preventDefault();
onSelect(result);
}}
>
{result.type === 'postcode' ? (
<>
<SearchIcon className={`${iconSize} text-warm-400 dark:text-warm-500 shrink-0`} />
<span className="text-warm-700 dark:text-warm-200">{result.label}</span>
</>
) : result.type === 'address' ? (
<>
<HouseIcon className={`${iconSize} text-warm-400 dark:text-warm-500 shrink-0`} />
<span className="min-w-0 text-warm-700 dark:text-warm-200">
<span className="block truncate">{result.address}</span>
<span className="block truncate text-warm-400 dark:text-warm-500">
{result.postcode}
</span>
</span>
</span>
</>
) : (
<>
<MapPinIcon className={`${iconSize} text-warm-400 dark:text-warm-500 shrink-0`} />
<span className="text-warm-700 dark:text-warm-200">
{result.name}
{result.city && (
<span className="text-warm-400 dark:text-warm-500"> ({result.city})</span>
)}
</span>
</>
)}
</button>
))}
</>
) : (
<>
<MapPinIcon className={`${iconSize} text-warm-400 dark:text-warm-500 shrink-0`} />
<span className="text-warm-700 dark:text-warm-200">
{result.name}
{result.city && (
<span className="text-warm-400 dark:text-warm-500"> ({result.city})</span>
)}
</span>
</>
)}
</button>
))
)}
</div>
);
@ -123,6 +145,7 @@ export function PlaceSearchInput({
<input
ref={inputRef}
type="text"
name={name}
value={search.query}
onChange={(e) => {
search.handleInputChange(e.target.value);
@ -132,11 +155,12 @@ export function PlaceSearchInput({
search.showEmptySearches();
}}
onKeyDown={(e) => search.handleKeyDown(e, onSelect)}
aria-label={ariaLabel ?? placeholder}
placeholder={placeholder}
className={inputClassName}
/>
{loading && (
{showSpinner && (
<div
className={`absolute right-2 top-1/2 -translate-y-1/2 ${spinnerSize} border-2 border-warm-300 dark:border-warm-600 border-t-teal-500 rounded-full animate-spin`}
/>

View file

@ -4,18 +4,27 @@ interface SearchInputProps {
value: string;
onChange: (value: string) => void;
placeholder?: string;
ariaLabel?: string;
className?: string;
}
export function SearchInput({ value, onChange, placeholder, className = '' }: SearchInputProps) {
export function SearchInput({
value,
onChange,
placeholder,
ariaLabel,
className = '',
}: SearchInputProps) {
const { t } = useTranslation();
const inputPlaceholder = placeholder ?? t('common.search');
return (
<input
type="text"
value={value}
onChange={(e) => onChange(e.target.value)}
placeholder={placeholder ?? t('common.search')}
placeholder={inputPlaceholder}
aria-label={ariaLabel ?? inputPlaceholder}
className={`w-full px-2 py-1 text-sm border rounded bg-white dark:bg-navy-800 dark:text-warm-200 border-warm-200 dark:border-navy-700 placeholder-warm-400 dark:placeholder-warm-500 focus:outline-none focus:ring-1 focus:ring-teal-400 ${className}`}
/>
);

View file

@ -13,6 +13,7 @@ const LISTING_CLUSTER_MAX_ZOOM = 24;
const LISTING_CLUSTER_POPUP_LIMIT = 30;
const LISTING_SPIDERFY_LIMIT = 12;
const TILE_SIZE = 512;
const PRICE_LABEL_CHARACTER_SET = '£0123456789.kM';
interface SingleListingPopupInfo {
mode: 'single';
@ -472,6 +473,7 @@ export function useListingLayers({ listings, zoom, isDark }: UseListingLayersPro
outlineWidth: 3,
outlineColor: isDark ? [10, 10, 10, 220] : [255, 255, 255, 230],
fontSettings: { sdf: true },
characterSet: PRICE_LABEL_CHARACTER_SET,
sizeUnits: 'pixels',
sizeMinPixels: 10,
sizeMaxPixels: 14,

View file

@ -160,6 +160,7 @@ export function useLocationSearch(mode?: string) {
const [recentSearches, setRecentSearches] = useState<SearchResult[]>(readRecentSearches);
const [activeIndex, setActiveIndex] = useState(-1);
const [open, setOpen] = useState(false);
const [searching, setSearching] = useState(false);
const abortRef = useRef<AbortController | null>(null);
const debounceRef = useRef<ReturnType<typeof setTimeout> | null>(null);
const latestQueryRef = useRef('');
@ -176,6 +177,7 @@ export function useLocationSearch(mode?: string) {
const trimmed = value.trim();
if (!trimmed) {
setSearching(false);
setResults(recentSearches);
lastResultsRef.current = [];
setOpen(recentSearches.length > 0);
@ -183,6 +185,7 @@ export function useLocationSearch(mode?: string) {
}
if (!mode && looksLikePostcode(trimmed)) {
setSearching(false);
const postcodeResults: SearchResult[] = [
{ type: 'postcode', label: normalizePostcode(trimmed) },
];
@ -192,6 +195,7 @@ export function useLocationSearch(mode?: string) {
}
if (trimmed.length < 2) {
setSearching(false);
setResults([]);
setOpen(false);
return;
@ -200,6 +204,7 @@ export function useLocationSearch(mode?: string) {
const locallyFilteredResults = filterResultsForQuery(lastResultsRef.current, trimmed);
setResults(locallyFilteredResults);
setOpen(locallyFilteredResults.length > 0);
setSearching(true);
debounceRef.current = setTimeout(async () => {
const controller = new AbortController();
@ -211,7 +216,13 @@ export function useLocationSearch(mode?: string) {
`/api/places?${params}`,
authHeaders({ signal: controller.signal })
);
if (!res.ok) return;
if (!res.ok) {
if (!controller.signal.aborted && latestQueryRef.current.trim() === trimmed) {
setResults([]);
setOpen(true);
}
return;
}
const json: {
places: PlaceResult[];
postcodes?: string[];
@ -253,9 +264,17 @@ export function useLocationSearch(mode?: string) {
lastResultsRef.current = combinedResults;
const matchingResults = filterResultsForQuery(combinedResults, trimmed);
setResults(matchingResults);
setOpen(matchingResults.length > 0);
setOpen(true);
} catch (err) {
logNonAbortError('places search', err);
if (!controller.signal.aborted && latestQueryRef.current.trim() === trimmed) {
setResults([]);
setOpen(true);
}
} finally {
if (!controller.signal.aborted && latestQueryRef.current.trim() === trimmed) {
setSearching(false);
}
}
}, 200);
},
@ -264,7 +283,7 @@ export function useLocationSearch(mode?: string) {
const showEmptySearches = useCallback(() => {
if (latestQueryRef.current.trim()) {
setOpen(results.length > 0);
setOpen(results.length > 0 || latestQueryRef.current.trim().length >= 2);
return;
}
@ -278,6 +297,7 @@ export function useLocationSearch(mode?: string) {
const clear = useCallback(() => {
setQuery('');
latestQueryRef.current = '';
setSearching(false);
setResults([]);
lastResultsRef.current = [];
setOpen(false);
@ -308,6 +328,8 @@ export function useLocationSearch(mode?: string) {
e.preventDefault();
if (activeIndex >= 0 && activeIndex < results.length) {
onSelect(results[activeIndex]);
} else if (results.length > 0) {
onSelect(results[0]);
} else if (looksLikePostcode(query)) {
onSelect({ type: 'postcode', label: normalizePostcode(query) });
}
@ -332,6 +354,7 @@ export function useLocationSearch(mode?: string) {
activeIndex,
setActiveIndex,
open,
searching,
setOpen,
handleInputChange,
handleKeyDown,

View file

@ -916,6 +916,7 @@ const de: Translations = {
// ── Location Search ────────────────────────────────
locationSearch: {
placeholder: 'Orte oder Postcodes suchen...',
noResults: 'Keine passenden Orte oder Postcodes',
postcodeNotFound: 'Postcode nicht gefunden',
lookupFailed: 'Suche fehlgeschlagen',
searchLabel: 'Orte oder Postcodes suchen',

View file

@ -892,6 +892,7 @@ const en = {
// ── Location Search ────────────────────────────────
locationSearch: {
placeholder: 'Search places or postcodes...',
noResults: 'No matching places or postcodes',
postcodeNotFound: 'Postcode not found',
lookupFailed: 'Lookup failed',
searchLabel: 'Search places or postcodes',

View file

@ -924,6 +924,7 @@ const fr: Translations = {
// ── Location Search ────────────────────────────────
locationSearch: {
placeholder: 'Rechercher des lieux ou codes postaux...',
noResults: 'Aucun lieu ni code postal correspondant',
postcodeNotFound: 'Code postal introuvable',
lookupFailed: 'Échec de la recherche',
searchLabel: 'Rechercher des lieux ou codes postaux',

View file

@ -876,6 +876,7 @@ const hi: Translations = {
locationSearch: {
placeholder: 'स्थान या पोस्टकोड खोजें...',
noResults: 'कोई मिलती-जुलती जगह या पोस्टकोड नहीं मिला',
postcodeNotFound: 'पोस्टकोड नहीं मिला',
lookupFailed: 'खोज विफल रही',
searchLabel: 'स्थान या पोस्टकोड खोजें',

View file

@ -910,6 +910,7 @@ const hu: Translations = {
// ── Location Search ────────────────────────────────
locationSearch: {
placeholder: 'Helyek vagy irányítószámok keresése...',
noResults: 'Nincs egyező hely vagy irányítószám',
postcodeNotFound: 'Irányítószám nem található',
lookupFailed: 'A keresés sikertelen',
searchLabel: 'Helyek vagy irányítószámok keresése',

View file

@ -850,6 +850,7 @@ const zh: Translations = {
// ── Location Search ────────────────────────────────
locationSearch: {
placeholder: '搜索地点或邮编...',
noResults: '未找到匹配的地点或邮编',
postcodeNotFound: '未找到该邮编',
lookupFailed: '查询失败',
searchLabel: '搜索地点或邮编',

View file

@ -1,19 +1,33 @@
from __future__ import annotations
import zipfile
import json
import zipfile
import polars as pl
from pipeline.validate_outputs import main
def write_boundary(path, postcodes):
def polygon(offset=0):
    """Build a small square GeoJSON ``Polygon`` geometry for boundary fixtures.

    ``offset`` is converted to ``float`` and used as the x coordinate of the
    square's left edge; the square is 0.001 wide and 0.001 tall with its
    bottom edge at y = 0, so integer offsets yield non-overlapping squares.
    The single ring is closed: its last vertex repeats the first.
    """
    west = float(offset)
    east = west + 0.001
    south, north = 0.0, 0.001
    ring = [
        (west, south),
        (east, south),
        (east, north),
        (west, north),
        (west, south),
    ]
    return {"type": "Polygon", "coordinates": [ring]}
def write_boundary(path, postcodes, geometries=None):
units = path / "units"
units.mkdir(parents=True)
features = [
{"type": "Feature", "properties": {"postcodes": postcode}, "geometry": None}
for postcode in postcodes
{
"type": "Feature",
"properties": {"postcodes": postcode},
"geometry": (geometries[index] if geometries else polygon(index)),
}
for index, postcode in enumerate(postcodes)
]
(units / "AA1.geojson").write_text(
json.dumps({"type": "FeatureCollection", "features": features})
@ -111,3 +125,100 @@ def test_rejects_postcode_boundary_mismatch(tmp_path, monkeypatch, capsys):
stderr = capsys.readouterr().err
assert "missing boundaries" in stderr
assert "boundary postcodes are absent" in stderr
def test_rejects_invalid_postcode_boundary_features(tmp_path, monkeypatch, capsys):
    """The validator must exit 1 and report each kind of malformed feature.

    The fixture unit file mixes a duplicated postcode, a feature without
    ``properties.postcodes``, a feature without geometry and a feature whose
    ring crosses itself; each expected message is checked on stderr.
    """
    postcodes_path = tmp_path / "postcodes.parquet"
    boundaries_path = tmp_path / "postcode_boundaries"
    unit_dir = boundaries_path / "units"
    unit_dir.mkdir(parents=True)
    pl.DataFrame({"postcode": ["AA1 1AA"]}).write_parquet(postcodes_path)

    def feature(properties, geometry):
        return {"type": "Feature", "properties": properties, "geometry": geometry}

    # The ring's edges (0,0)-(1,1) and (1,0)-(0,1) cross each other.
    bowtie = {
        "type": "Polygon",
        "coordinates": [[(0, 0), (1, 1), (1, 0), (0, 1), (0, 0)]],
    }
    collection = {
        "type": "FeatureCollection",
        "features": [
            feature({"postcodes": "AA1 1AA"}, polygon()),
            feature({"postcodes": "AA1 1AA"}, polygon(1)),  # same postcode twice
            feature({}, polygon(2)),  # no "postcodes" property
            feature({"postcodes": "AA1 1AB"}, None),  # no geometry
            feature({"postcodes": "AA1 1AC"}, bowtie),  # self-crossing ring
        ],
    }
    (unit_dir / "AA1.geojson").write_text(json.dumps(collection))

    cli_args = [
        "validate_outputs",
        "--postcode-boundary-match",
        f"{postcodes_path}::{boundaries_path}",
    ]
    monkeypatch.setattr("sys.argv", cli_args)

    exit_code = main()
    assert exit_code == 1

    stderr = capsys.readouterr().err
    for expected_message in (
        "duplicate boundary postcode features",
        "missing properties.postcodes",
        "missing or empty geometry",
        "invalid boundary geometries",
    ):
        assert expected_message in stderr
def test_validates_active_english_arcgis_boundary_matches(tmp_path, monkeypatch):
    """``--active-postcode-boundary-match`` exits 0 when boundaries cover exactly
    the rows that are E92000001 with a null ``doterm``."""
    arcgis_path = tmp_path / "arcgis.parquet"
    boundaries_path = tmp_path / "postcode_boundaries"

    # Only "AA1 1AA" has country code E92000001 *and* a null ``doterm``:
    # "AA1 1AB" carries a ``doterm`` date and "CF1 1AA" has a different
    # country code.
    arcgis_rows = {
        "pcds": ["AA1 1AA", "AA1 1AB", "CF1 1AA"],
        "ctry25cd": ["E92000001", "E92000001", "W92000004"],
        "doterm": [None, "2020-01-01", None],
    }
    pl.DataFrame(arcgis_rows).write_parquet(arcgis_path)
    write_boundary(boundaries_path, ["AA1 1AA"])

    cli_args = [
        "validate_outputs",
        "--active-postcode-boundary-match",
        f"{arcgis_path}::{boundaries_path}",
    ]
    monkeypatch.setattr("sys.argv", cli_args)

    exit_code = main()
    assert exit_code == 0
def test_rejects_active_english_arcgis_boundary_mismatch(tmp_path, monkeypatch, capsys):
    """``--active-postcode-boundary-match`` exits 1 on a two-way mismatch.

    The boundary set omits "AA1 1AB" (E92000001, null ``doterm``) and includes
    "CF1 1AA" (a non-E92000001 row); both stderr messages are asserted.
    """
    arcgis_path = tmp_path / "arcgis.parquet"
    boundaries_path = tmp_path / "postcode_boundaries"

    arcgis_rows = {
        "pcds": ["AA1 1AA", "AA1 1AB", "CF1 1AA"],
        "ctry25cd": ["E92000001", "E92000001", "W92000004"],
        "doterm": [None, None, None],
    }
    pl.DataFrame(arcgis_rows).write_parquet(arcgis_path)
    write_boundary(boundaries_path, ["AA1 1AA", "CF1 1AA"])

    cli_args = [
        "validate_outputs",
        "--active-postcode-boundary-match",
        f"{arcgis_path}::{boundaries_path}",
    ]
    monkeypatch.setattr("sys.argv", cli_args)

    exit_code = main()
    assert exit_code == 1

    stderr = capsys.readouterr().err
    for expected_message in (
        "active English postcodes",
        "not active English postcodes",
    ):
        assert expected_message in stderr

View file

@ -46,8 +46,21 @@ def _require_tippecanoe() -> str:
return executable
def _write_geojsonseq(csvs: list[Path], output_path: Path) -> int:
df = (
def _write_geojsonseq(csvs: list[Path], output_path: Path) -> tuple[int, int]:
"""Write one weighted GeoJSON point per distinct (anchor, month, type).
Returns ``(feature_count, incident_count)``. police.uk snaps every incident
to a shared "map point" anchor, so many incidents land on the exact same
coordinate. Collapsing them into one feature carrying ``count`` (the number
of incidents) keeps the per-crime-type and per-month filters intact while
turning each hotspot into a single high-weight point. That matters because
tippecanoe's ``--drop-densest-as-needed`` thins *feature density*, not
weight: with one feature per row the busiest streets were silently deleted;
with one weighted feature per anchor those hotspots survive and the dropped
detail is only redundant duplicate points. The heatmap reads ``count`` as
its weight.
"""
grouped = (
pl.scan_csv(
csvs,
schema_overrides={
@ -67,11 +80,15 @@ def _write_geojsonseq(csvs: list[Path], output_path: Path) -> int:
.drop_nulls(["lon", "lat"])
.filter(pl.col("lon").is_between(-9.5, 5.0))
.filter(pl.col("lat").is_between(49.0, 57.0))
.group_by("lon", "lat", "month", "crime_type")
.len()
.rename({"len": "count"})
.collect(engine="streaming")
)
incident_count = int(grouped["count"].sum())
with output_path.open("w") as file:
for row in df.iter_rows(named=True):
for row in grouped.iter_rows(named=True):
feature = {
"type": "Feature",
"geometry": {
@ -79,15 +96,15 @@ def _write_geojsonseq(csvs: list[Path], output_path: Path) -> int:
"coordinates": [row["lon"], row["lat"]],
},
"properties": {
"count": 1,
"weight": 1,
"count": row["count"],
"weight": row["count"],
"month": row["month"],
"crime_type": row["crime_type"],
},
}
file.write(json.dumps(feature, separators=(",", ":")) + "\n")
return df.height
return grouped.height, incident_count
def build_crime_hotspot_tiles(
@ -104,9 +121,10 @@ def build_crime_hotspot_tiles(
with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as tmp:
ndjson_path = Path(tmp) / "crime_hotspots.geojsonseq"
feature_count = _write_geojsonseq(csvs, ndjson_path)
feature_count, incident_count = _write_geojsonseq(csvs, ndjson_path)
print(
f"Writing {feature_count:,} approximate crime heatmap points "
f"Writing {feature_count:,} weighted crime heatmap points "
f"({incident_count:,} incidents) "
f"from {min(selected_months)} to {max(selected_months)}"
)

View file

@ -1,14 +1,25 @@
"""Aggregate police.uk street crime to postcodes by 50m spatial proximity.
"""Aggregate police.uk street crime to postcodes by spatial proximity.
Instead of attributing each incident to its published LSOA code, this transform
counts the anonymised incident *points* that fall within 50m of each postcode's
boundary polygon (the polygon buffered outward by 50m). A point inside several
overlapping buffers counts for each postcode -- the same multiplicity the
tree-density filter uses for features near more than one postcode.
counts the anonymised incident *points* that fall within ``buffer_m`` (default
100m) of each postcode's boundary polygon (the polygon buffered outward). A point
inside several overlapping buffers counts for each postcode -- the same
multiplicity the tree-density filter uses for features near more than one
postcode. The wide 100m buffer deliberately smooths police.uk's snap-to-grid
coordinates, which would otherwise make the count hypersensitive to which side of
a narrow line a shared "map point" anchor happened to land on.
The metric is a raw annualised count ("incidents/year within 50m"); there is no
per-capita denominator. Outputs mirror the old LSOA transform's shape but are
keyed on ``postcode`` instead of ``LSOA code``:
Counts are **area-normalised**: each postcode's count is divided by its buffered
catchment area and rescaled by the median catchment area, so the metric reflects
crime *density* rather than how much ground the buffer sweeps (a median-sized
catchment is left unchanged; a large rural postcode is no longer inflated simply
for covering more of the map). Normalising by the buffered area -- the region
that actually collects points -- rather than the raw polygon keeps tiny unit
postcodes from being over-inflated by the fixed buffer-ring floor. The headline
``"{type} (avg/yr)"`` is the simple mean of the per-year annualised counts, so it
equals the average of the by-year chart bars.
Outputs mirror the old LSOA transform's shape but are keyed on ``postcode``:
* ``crime_by_postcode.parquet`` -- ``postcode`` + ``"{type} (avg/yr)"`` columns.
* ``crime_by_postcode_by_year.parquet`` -- ``postcode`` + ``"{type} (by year)"``
@ -16,14 +27,15 @@ keyed on ``postcode`` instead of ``LSOA code``:
Caveat: police.uk coordinates are snapped to a fixed set of anonymous "map
points", not true locations, and a share of rows have no coordinate at all
(dropped here). Spatial totals are therefore lower than, and fuzzier than, the
old LSOA-tagged counts -- by design, not a regression.
(dropped here). Spatial totals are therefore fuzzier than the old LSOA-tagged
counts -- by design, not a regression.
"""
from __future__ import annotations
import argparse
import re
import sys
from pathlib import Path
import numpy as np
@ -41,7 +53,7 @@ from pipeline.transform.postcode_boundaries.loader import load_postcode_polygons
# Serious types first so column order is stable and self-documenting.
ALL_CRIME_TYPES: tuple[str, ...] = SERIOUS_CRIME_TYPES + MINOR_CRIME_TYPES
DEFAULT_BUFFER_M = 50.0
DEFAULT_BUFFER_M = 100.0
MONTH_DIR_RE = re.compile(r"^\d{4}-\d{2}$")
# Generous GB bounds; points outside fall in no English postcode anyway, but
@ -108,10 +120,11 @@ def _accumulate_counts(
"Month": pl.Utf8,
"Crime type": pl.Utf8,
}
known_types = list(type_to_idx)
years = list(year_to_idx)
total_points = 0
total_matches = 0
total_dropped = 0
unknown_type_counts: dict[str, int] = {}
for start in range(0, len(csvs), _CSV_BATCH):
batch = csvs[start : start + _CSV_BATCH]
@ -122,31 +135,47 @@ def _accumulate_counts(
ignore_errors=True,
)
.select("Longitude", "Latitude", "Month", "Crime type")
.with_columns(pl.col("Month").str.slice(0, 4).cast(pl.Int32).alias("year"))
# strict=False: a single malformed Month drops only that row instead
# of aborting the whole build (a non-numeric year becomes null and is
# filtered out by the year membership check below).
.with_columns(
pl.col("Month").str.slice(0, 4).cast(pl.Int32, strict=False).alias("year")
)
.filter(
pl.col("Longitude").is_not_null()
& pl.col("Latitude").is_not_null()
& pl.col("Longitude").is_between(*LON_BOUNDS)
& pl.col("Latitude").is_between(*LAT_BOUNDS)
& pl.col("Crime type").is_in(known_types)
& pl.col("year").is_in(list(year_to_idx))
& pl.col("Crime type").is_not_null()
& (pl.col("Crime type") != "")
& pl.col("year").is_in(years)
)
# Map crime types to indices with default=None so an unrecognised
# type yields a null index we can *report* rather than silently drop
# (the legacy LSOA path surfaced unknown types via its dynamic pivot).
.with_columns(
pl.col("Crime type")
.replace_strict(type_to_idx, return_dtype=pl.Int32)
.replace_strict(type_to_idx, default=None, return_dtype=pl.Int32)
.alias("tidx"),
pl.col("year")
.replace_strict(year_to_idx, return_dtype=pl.Int32)
.alias("yidx"),
)
.select("Longitude", "Latitude", "tidx", "yidx")
.select("Longitude", "Latitude", "Crime type", "tidx", "yidx")
.collect(engine="streaming")
)
rows_in = frame.height
if rows_in == 0:
if frame.height == 0:
continue
unknown = frame.filter(pl.col("tidx").is_null())
if unknown.height:
for name, cnt in unknown.group_by("Crime type").len().iter_rows():
unknown_type_counts[name] = unknown_type_counts.get(name, 0) + cnt
frame = frame.filter(pl.col("tidx").is_not_null())
if frame.height == 0:
continue
lon = frame["Longitude"].to_numpy()
lat = frame["Latitude"].to_numpy()
tidx = frame["tidx"].to_numpy()
@ -177,6 +206,20 @@ def _accumulate_counts(
if total_dropped:
print(f"Dropped {total_dropped:,} points outside the BNG transform domain")
if unknown_type_counts:
total_unknown = sum(unknown_type_counts.values())
listed = ", ".join(
f"{name!r} ({cnt:,})"
for name, cnt in sorted(
unknown_type_counts.items(), key=lambda kv: kv[1], reverse=True
)
)
print(
f"WARNING: dropped {total_unknown:,} incidents with crime types not in "
f"ALL_CRIME_TYPES (taxonomy is stale -- update SERIOUS/MINOR_CRIME_TYPES): "
f"{listed}",
file=sys.stderr,
)
def _rollup_long(
@ -195,12 +238,27 @@ def _rollup_long(
def _write_avg_yr(
postcodes: np.ndarray,
counts: np.ndarray,
valid_month_count: int,
years: list[int],
months_in_year: dict[int, int],
norm: np.ndarray,
output_path: Path,
) -> None:
"""Write ``postcode`` + ``"{type} (avg/yr)"`` annualised totals."""
totals = counts.sum(axis=2) # (n_postcodes, n_types)
avg = np.round(totals / valid_month_count * 12.0, 1).astype(np.float32)
"""Write ``postcode`` + ``"{type} (avg/yr)"`` density-normalised averages.
The headline figure is the **simple mean of the per-year annualised counts**
(each year scaled to a 12-month equivalent), so it equals the average of the
by-year chart bars instead of a month-weighted pooled rate. Each postcode's
value is then multiplied by ``norm`` (median_area / buffered catchment area)
so the metric is a density rather than a footprint-inflated raw count.
"""
months = np.array([months_in_year[year] for year in years], dtype=np.float64)
per_year = counts.astype(np.float64) * 12.0 / months[None, None, :]
# Average over the years each type is actually observed anywhere -- the same
# per-type x-span the by-year chart plots (server-rs/.../crime_by_year.rs).
type_year_present = counts.sum(axis=0) > 0 # (n_types, n_years)
years_per_type = np.clip(type_year_present.sum(axis=1), 1, None).astype(np.float64)
avg = per_year.sum(axis=2) / years_per_type[None, :] # (n_postcodes, n_types)
avg = np.round(avg * norm[:, None], 1).astype(np.float32)
data: dict[str, np.ndarray] = {"postcode": postcodes}
for type_idx, name in enumerate(ALL_CRIME_TYPES):
@ -216,11 +274,20 @@ def _write_by_year(
counts: np.ndarray,
years: list[int],
months_in_year: dict[int, int],
norm: np.ndarray,
output_path: Path,
) -> None:
"""Write nested ``"{type} (by year)"`` series plus Serious/Minor rollups."""
"""Write nested ``"{type} (by year)"`` series plus Serious/Minor rollups.
Per-year counts are area-normalised by the same ``norm`` (median_area /
buffered catchment area) factor applied to the avg/yr headline, so the chart
bars and the headline figure remain mutually consistent.
"""
months = np.array([months_in_year[year] for year in years], dtype=np.float64)
annual = np.round(counts.astype(np.float64) * 12.0 / months[None, None, :], 1)
annual = np.round(
counts.astype(np.float64) * 12.0 / months[None, None, :] * norm[:, None, None],
1,
)
pc_i, ty_i, yr_i = np.nonzero(counts)
if pc_i.size == 0:
@ -278,8 +345,27 @@ def transform_crime_spatial(
)
postcodes, polygons = load_postcode_polygons(boundaries_dir, max_postcodes)
print(f"Buffering {len(postcodes):,} postcode polygons by {buffer_m:g}m...")
_buffers, tree = _build_tree(polygons, buffer_m)
buffers, tree = _build_tree(polygons, buffer_m)
# Area-normalisation factor (median_area / catchment_area): divides out the
# size of each postcode's catchment so the count measures crime density, not
# how much ground the buffer sweeps. We normalise by the *buffered* area --
# the region that actually collects points -- rather than the raw polygon, so
# a tiny unit postcode isn't over-inflated by the fixed buffer-ring floor.
# Buffers are in EPSG:27700, so shapely.area is in m^2.
areas = shapely.area(buffers).astype(np.float64)
usable_area = np.isfinite(areas) & (areas > 0)
if not usable_area.any():
raise ValueError("No postcode buffers have a positive area to normalise by")
median_area = float(np.median(areas[usable_area]))
norm = np.zeros(len(postcodes), dtype=np.float64)
norm[usable_area] = median_area / areas[usable_area]
print(
f"Area-normalising to median catchment area {median_area:,.0f} m^2 "
f"({int(usable_area.sum()):,}/{len(areas):,} postcodes have usable area)"
)
type_to_idx = {name: idx for idx, name in enumerate(ALL_CRIME_TYPES)}
year_to_idx = {year: idx for idx, year in enumerate(years)}
@ -288,8 +374,8 @@ def transform_crime_spatial(
transformer = Transformer.from_crs("EPSG:4326", "EPSG:27700", always_xy=True)
_accumulate_counts(csvs, tree, type_to_idx, year_to_idx, transformer, counts)
_write_avg_yr(postcodes, counts, valid_month_count, output_path)
_write_by_year(postcodes, counts, years, months_in_year, by_year_output_path)
_write_avg_yr(postcodes, counts, years, months_in_year, norm, output_path)
_write_by_year(postcodes, counts, years, months_in_year, norm, by_year_output_path)
def main() -> None:

View file

@ -116,6 +116,66 @@ TREE_DENSITY_FEATURE = "Street tree density percentile"
_POSTCODE_TREE_DENSITY_PERCENTILE_RE = re.compile(
r"^Tree canopy density percentile within \d+m$"
)
# Columns that `_finalize_merged_columns` removes from the merged frame before
# it applies `_FINAL_RENAME_COLUMNS`. The drop is called with strict=False, so
# a name that is absent from a particular frame is skipped instead of raising.
_FINAL_DROP_COLUMNS = [
"inspection_date",
"_bedrooms",
"LSOA name (2021)",
"Local Authority District code (2024)",
"Local Authority District name (2024)",
"Wider Barriers Sub-domain Score",
"Geographical Barriers Sub-domain Score",
"Adult Skills Sub-domain Score",
"Children and Young People Sub-domain Score",
"Crime Score",
"Living Environment Score",
"Index of Multiple Deprivation (IMD) Score",
"Income Deprivation Affecting Older People (IDAOPI) Score (rate)",
"Income Deprivation Affecting Children Index (IDACI) Score (rate)",
"Barriers to Housing and Services Score",
"oa21",
"pcon",
"epc_property_type",
"pp_property_type",
"built_form",
]
# Mapping of internal column name -> final output column name, applied by
# `_finalize_merged_columns` once `_FINAL_DROP_COLUMNS` has been dropped. The
# rename is called with strict=False, so keys missing from a frame are skipped.
_FINAL_RENAME_COLUMNS = {
"date_of_transfer": "Date of last transaction",
"construction_age_band": "Construction year",
"is_construction_date_approximate": "Is construction date approximate",
"Income Score (rate)": "Income Score",
"Employment Score (rate)": "Employment Score",
"Indoors Sub-domain Score": "Housing Conditions Score",
"Outdoors Sub-domain Score": "Air Quality and Road Safety Score",
"pp_address": "Address per Property Register",
"epc_address": "Address per EPC",
"postcode": "Postcode",
"duration": "Leasehold/Freehold",
"current_energy_rating": "Current energy rating",
"potential_energy_rating": "Potential energy rating",
"total_floor_area": "Total floor area (sqm)",
"property_type": "Property type",
"restaurants_2km": "Number of restaurants within 2km",
"groceries_2km": "Number of grocery shops and supermarkets within 2km",
"latest_price": "Last known price",
"number_habitable_rooms": "Number of bedrooms & living rooms",
"noise_lden_db": "Noise (dB)",
"good_primary_5km": "Good+ primary schools within 5km",
"good_secondary_5km": "Good+ secondary schools within 5km",
"good_primary_2km": "Good+ primary schools within 2km",
"good_secondary_2km": "Good+ secondary schools within 2km",
"outstanding_primary_5km": "Outstanding primary schools within 5km",
"outstanding_secondary_5km": "Outstanding secondary schools within 5km",
"outstanding_primary_2km": "Outstanding primary schools within 2km",
"outstanding_secondary_2km": "Outstanding secondary schools within 2km",
"max_download_speed": "Max available download speed (Mbps)",
"serious_crime_avg_yr": "Serious crime (avg/yr)",
"minor_crime_avg_yr": "Minor crime (avg/yr)",
"mean_monthly_rent": "Estimated monthly rent",
"floor_height": "Interior height (m)",
"was_council_house": "Former council house",
"median_age": "Median age",
"turnout_pct": "Voter turnout (%)",
}
_RENT_SOURCE_UNAVAILABLE_LADS = {
# ONS PIPR does not publish LAD-level private-rent estimates for these
# small authorities. Keep rent null there, but fail on any other LAD miss.
@ -707,6 +767,181 @@ def _validate_property_postcodes(df: pl.DataFrame) -> None:
)
def _active_english_postcode_area(arcgis_raw: pl.LazyFrame) -> pl.LazyFrame:
    """Return the supported postcode universe with geography join keys.

    Keeps rows whose ``ctry25cd`` is ``E92000001`` and whose ``doterm`` is
    null, then projects them to one row per non-null ``postcode`` carrying
    ``lat``, ``lon``, ``ctry25cd`` and the short-named ``lsoa21`` / ``oa21`` /
    ``pcon`` join keys.
    """
    is_english = pl.col("ctry25cd") == "E92000001"
    is_active = pl.col("doterm").is_null()
    supported_rows = arcgis_raw.filter(is_english).filter(is_active)

    # Alias the versioned source column names to the short names used for joins.
    keyed = supported_rows.select(
        pl.col("pcds").alias("postcode"),
        "lat",
        pl.col("long").alias("lon"),
        "ctry25cd",
        pl.col("lsoa21cd").alias("lsoa21"),
        pl.col("oa21cd").alias("oa21"),
        pl.col("pcon24cd").alias("pcon"),
    )
    return keyed.drop_nulls(["postcode"]).unique(["postcode"])
def _remap_terminated_postcodes(
    wide: pl.LazyFrame, postcode_mapping: pl.LazyFrame
) -> pl.LazyFrame:
    """Swap each row's ``postcode`` for its mapped successor where one exists.

    ``postcode_mapping`` is left-joined on ``postcode`` == ``old_postcode``.
    Rows with a non-null ``new_postcode`` take that value; all other rows keep
    their existing ``postcode``. The helper ``new_postcode`` column is dropped
    from the result.
    """
    with_successor = wide.join(
        postcode_mapping,
        how="left",
        left_on="postcode",
        right_on="old_postcode",
    )
    resolved_postcode = pl.coalesce("new_postcode", "postcode").alias("postcode")
    return with_successor.with_columns(resolved_postcode).drop("new_postcode")
def _filter_to_active_english_postcodes(
    wide: pl.LazyFrame, active_postcodes: pl.LazyFrame
) -> pl.LazyFrame:
    """Keep only the ``wide`` rows whose ``postcode`` occurs in ``active_postcodes``.

    Implemented as a semi join, so no columns from ``active_postcodes`` are
    added and ``wide`` rows are not duplicated.
    """
    supported_only = wide.join(active_postcodes, how="semi", on="postcode")
    return supported_only
def _join_area_side_tables(
    base: pl.LazyFrame,
    *,
    iod: pl.LazyFrame,
    ethnicity: pl.LazyFrame,
    crime: pl.LazyFrame,
    median_age: pl.LazyFrame,
    election: pl.LazyFrame,
    poi_counts: pl.LazyFrame,
    noise: pl.LazyFrame,
    school_proximity: pl.LazyFrame,
    conservation_areas: pl.LazyFrame,
    tree_density: pl.LazyFrame | None,
    broadband: pl.LazyFrame,
) -> pl.LazyFrame:
    """Left-join every area-level side table onto ``base`` and add crime rollups.

    Joins are applied in a fixed order: ``iod`` (by LSOA), ``ethnicity`` (by
    local authority district code, which must be present after the ``iod``
    join), ``crime`` (by postcode), ``median_age`` (by LSOA), ``election`` (by
    ``pcon``), then the postcode-keyed tables. ``tree_density`` is optional and
    skipped when ``None``. All joins are left joins, so ``base`` rows are never
    filtered out here.

    Two derived columns are added right after the crime join:
    ``serious_crime_avg_yr`` and ``minor_crime_avg_yr``, each the horizontal
    sum of the listed per-type ``"(avg/yr)"`` columns. Null values of
    ``CONSERVATION_AREA_FEATURE`` after its join are filled with ``"No"``.
    """
    serious_columns = (
        "Violence and sexual offences (avg/yr)",
        "Robbery (avg/yr)",
        "Burglary (avg/yr)",
        "Possession of weapons (avg/yr)",
    )
    minor_columns = (
        "Anti-social behaviour (avg/yr)",
        "Criminal damage and arson (avg/yr)",
        "Shoplifting (avg/yr)",
        "Bicycle theft (avg/yr)",
        "Theft from the person (avg/yr)",
        "Other theft (avg/yr)",
        "Vehicle crime (avg/yr)",
        "Public order (avg/yr)",
        "Drugs (avg/yr)",
        "Other crime (avg/yr)",
    )

    joined = base.join(
        iod, left_on="lsoa21", right_on="LSOA code (2021)", how="left"
    )
    joined = joined.join(
        ethnicity,
        left_on="Local Authority District code (2024)",
        right_on="Geography_code",
        how="left",
    )

    # Crime is counted spatially per postcode (incidents within the crime
    # transform's buffer distance of the postcode boundary, 100m by default),
    # so it joins on postcode rather than LSOA.
    joined = joined.join(crime, on="postcode", how="left").with_columns(
        pl.sum_horizontal(*serious_columns).alias("serious_crime_avg_yr"),
        pl.sum_horizontal(*minor_columns).alias("minor_crime_avg_yr"),
    )

    joined = joined.join(median_age, on="lsoa21", how="left")
    joined = joined.join(election, on="pcon", how="left")

    # Remaining mandatory side tables are all keyed on postcode.
    for postcode_table in (poi_counts, noise, school_proximity, conservation_areas):
        joined = joined.join(postcode_table, on="postcode", how="left")
    joined = joined.with_columns(pl.col(CONSERVATION_AREA_FEATURE).fill_null("No"))

    if tree_density is not None:
        joined = joined.join(tree_density, on="postcode", how="left")

    return joined.join(
        broadband, left_on="postcode", right_on="bb_postcode", how="left"
    )
def _finalize_merged_columns(frame: pl.LazyFrame) -> pl.LazyFrame:
    """Drop internal-only columns, then rename the rest to their output names.

    Both steps are non-strict: entries of ``_FINAL_DROP_COLUMNS`` or keys of
    ``_FINAL_RENAME_COLUMNS`` that are not present in ``frame`` are skipped.
    """
    trimmed = frame.drop(_FINAL_DROP_COLUMNS, strict=False)
    return trimmed.rename(_FINAL_RENAME_COLUMNS, strict=False)
def _area_columns_from(columns: list[str]) -> list[str]:
    """Return, in input order, the names that are area-level columns.

    A name qualifies when it is in ``_AREA_COLUMNS`` or when
    ``_is_dynamic_poi_metric_column`` accepts it.
    """
    selected: list[str] = []
    for name in columns:
        if name in _AREA_COLUMNS or _is_dynamic_poi_metric_column(name):
            selected.append(name)
    return selected
def _property_columns_from(columns: list[str]) -> list[str]:
    """Return, in input order, the names that belong to the per-property output.

    A name is kept when it is neither in ``_AREA_COLUMNS`` nor a dynamic POI
    metric column. ``"Postcode"`` is always kept, regardless of those checks.
    """
    selected: list[str] = []
    for name in columns:
        is_area_level = name in _AREA_COLUMNS or _is_dynamic_poi_metric_column(name)
        if not is_area_level or name == "Postcode":
            selected.append(name)
    return selected
def _validate_postcode_feature_output(
    postcode_df: pl.DataFrame, expected_postcode_count: int
) -> None:
    """Raise ``ValueError`` unless ``postcode_df`` is a clean postcode table.

    Checks, in order:

    1. ``Postcode``, ``lat``, ``lon`` and ``ctry25cd`` columns all exist.
    2. The row count and the number of distinct ``Postcode`` values both equal
       ``expected_postcode_count``.
    3. No row has a null or blank ``Postcode``, a null ``lat`` / ``lon``, or a
       ``ctry25cd`` that is null or different from ``E92000001``.

    The first failing check raises; the error for check 3 includes up to ten
    offending rows as a sample.
    """
    missing = sorted({"Postcode", "lat", "lon", "ctry25cd"}.difference(postcode_df.columns))
    if missing:
        raise ValueError(f"Postcode feature output missing columns: {missing}")

    row_count = postcode_df.height
    unique_count = postcode_df["Postcode"].n_unique()
    matches_universe = (
        row_count == expected_postcode_count
        and unique_count == expected_postcode_count
    )
    if not matches_universe:
        raise ValueError(
            "Postcode feature output no longer matches the active England "
            "postcode universe: "
            f"rows={row_count:,}, unique={unique_count:,}, "
            f"expected={expected_postcode_count:,}"
        )

    postcode = pl.col("Postcode")
    country = pl.col("ctry25cd")
    blank_postcode = postcode.is_null() | (
        postcode.cast(pl.Utf8).str.strip_chars() == ""
    )
    ungeocoded = pl.col("lat").is_null() | pl.col("lon").is_null()
    unsupported_country = country.is_null() | (country != "E92000001")

    invalid = postcode_df.filter(blank_postcode | ungeocoded | unsupported_country)
    if invalid.height == 0:
        return

    sample = invalid.select("Postcode", "ctry25cd", "lat", "lon").head(10).to_dicts()
    raise ValueError(
        "Postcode feature output contains unsupported or ungeocoded rows: "
        f"{invalid.height} rows. Sample: {sample}"
    )
def _split_normal_outputs(
    df: pl.DataFrame,
    postcode_features: pl.DataFrame,
    *,
    expected_postcode_count: int,
) -> tuple[pl.DataFrame, pl.DataFrame]:
    """Split the merged data into ``(postcode_df, properties_df)``.

    ``postcode_df`` is ``postcode_features`` restricted to its area-level
    columns and is validated against ``expected_postcode_count`` before the
    property frame is built, so a validation failure raises ``ValueError``
    first. ``properties_df`` is ``df`` restricted to its per-property columns.
    """
    area_columns = _area_columns_from(postcode_features.columns)
    postcode_df = postcode_features.select(area_columns)
    _validate_postcode_feature_output(postcode_df, expected_postcode_count)

    property_columns = _property_columns_from(df.columns)
    properties_df = df.select(property_columns)
    return postcode_df, properties_df
# Map listings-parquet source columns to the `_actual_*` overlay columns
# carried alongside the wide frame through the postcode-keyed joins. After the
# rest of the pipeline finalises, listing rows pick their canonical dashboard
@ -927,9 +1162,7 @@ def _best_listing_match(
return best, float(best_score), "address", best_field
def _load_listings_for_merge(
listings_path: Path, arcgis_path: Path
) -> pl.DataFrame:
def _load_listings_for_merge(listings_path: Path, arcgis_path: Path) -> pl.DataFrame:
"""Read the listings parquet and prepare it for the wide-frame merge.
Output is keyed by `_listing_idx` and carries:
@ -1032,7 +1265,11 @@ def _load_direct_epc_candidates(
"_direct_epc_outcode": pl.Utf8,
"_direct_epc_canonical_property_type": pl.Utf8,
"_direct_epc_uprn": pl.Utf8,
**{column: dtype for column, dtype in _DIRECT_EPC_COLUMNS if column.startswith("_direct_")},
**{
column: dtype
for column, dtype in _DIRECT_EPC_COLUMNS
if column.startswith("_direct_")
},
}
if not listing_outcodes:
return pl.DataFrame(schema=schema)
@ -1089,9 +1326,7 @@ def _load_direct_epc_candidates(
pl.col("epc_address").alias("_direct_epc_address"),
pl.col("uprn").alias("_direct_epc_uprn"),
pl.col("total_floor_area").alias("_direct_total_floor_area"),
pl.col("number_habitable_rooms").alias(
"_direct_number_habitable_rooms"
),
pl.col("number_habitable_rooms").alias("_direct_number_habitable_rooms"),
pl.col("floor_height").alias("_direct_floor_height"),
pl.col("_direct_was_council_house").fill_null("No"),
)
@ -1141,9 +1376,7 @@ def _listing_match_frame(listings: pl.DataFrame) -> pl.DataFrame:
)
def _optional_lazy_col(
schema: pl.Schema, column: str, dtype: pl.DataType
) -> pl.Expr:
def _optional_lazy_col(schema: pl.Schema, column: str, dtype: pl.DataType) -> pl.Expr:
if column in schema:
return pl.col(column).cast(dtype, strict=False).alias(column)
return pl.lit(None, dtype=dtype).alias(column)
@ -1640,27 +1873,18 @@ def _build(
| (pl.col("total_floor_area") > MIN_FLOOR_AREA_M2)
)
# Remap terminated postcodes to nearest active successor
# Remap terminated postcodes to nearest active successor before filtering to
# the supported active-English postcode universe. Historical properties from
# terminated English postcodes are retained under their successor postcode.
postcode_mapping = build_postcode_mapping(arcgis_path)
wide = (
wide.join(
postcode_mapping.lazy(),
left_on="postcode",
right_on="old_postcode",
how="left",
)
.with_columns(
pl.coalesce("new_postcode", "postcode").alias("postcode"),
)
.drop("new_postcode")
)
wide = _remap_terminated_postcodes(wide, postcode_mapping.lazy())
arcgis_raw = pl.scan_parquet(arcgis_path)
postcode_country = arcgis_raw.select(
pl.col("pcds").alias("postcode"),
pl.col("ctry25cd"),
).unique(["postcode"])
wide = wide.join(postcode_country, on="postcode", how="left")
arcgis = _active_english_postcode_area(arcgis_raw)
active_postcodes = arcgis.select("postcode").unique()
active_postcode_count = (
active_postcodes.select(pl.len()).collect(engine="streaming").item()
)
wide = _filter_to_active_english_postcodes(wide, active_postcodes)
if listed_buildings_path is not None:
active_postcodes_for_listed = (
@ -1691,92 +1915,25 @@ def _build(
arcgis_path,
epc_path=actual_listings_epc_path,
)
wide = _filter_to_active_english_postcodes(wide, active_postcodes)
wide = wide.with_columns(pl.col(LISTED_BUILDING_FEATURE).fill_null("No"))
arcgis = (
arcgis_raw.filter(pl.col("ctry25cd") == "E92000001") # England only
.filter(pl.col("doterm").is_null()) # Active postcodes only
# NSPL Feb 2026 renamed geographic code columns to {field}{year}cd.
# Alias them back to the short canonical names used across the
# pipeline so downstream joins don't need to know about NSPL's
# versioning scheme.
.select(
pl.col("pcds").alias("postcode"),
"lat",
pl.col("long").alias("lon"),
pl.col("lsoa21cd").alias("lsoa21"),
pl.col("oa21cd").alias("oa21"),
pl.col("pcon24cd").alias("pcon"),
)
)
# NSPL Feb 2026 renamed geographic code columns to {field}{year}cd.
# `_active_english_postcode_area` aliases them back to the short canonical
# names used across the pipeline so downstream joins don't need to know
# about NSPL's versioning scheme.
wide = wide.join(arcgis, on="postcode", how="left")
postcode_area = arcgis
iod = pl.scan_parquet(iod_path).with_columns(
*(_less_deprived_percentile_expr(c) for c in _IOD_PERCENTILE_COLUMNS)
)
wide = wide.join(iod, left_on="lsoa21", right_on="LSOA code (2021)", how="left")
ethnicity = pl.scan_parquet(ethnicity_path)
wide = wide.join(
ethnicity,
left_on="Local Authority District code (2024)",
right_on="Geography_code",
how="left",
)
# Derive bedroom count: habitable rooms - 1 (assuming 1 reception room), clipped to 0..4
wide = wide.with_columns(
(pl.col("number_habitable_rooms") - 1)
.clip(0, 4)
.cast(pl.UInt8)
.alias("_bedrooms"),
)
rental = pl.scan_parquet(rental_prices_path).select(
"area_code", "bedrooms", "mean_monthly_rent"
)
wide = wide.join(
rental,
left_on=["Local Authority District code (2024)", "_bedrooms"],
right_on=["area_code", "bedrooms"],
how="left",
)
# Crime is counted spatially per postcode (incidents within 50m of the
# postcode boundary), so it joins on postcode rather than LSOA.
crime = pl.scan_parquet(crime_path)
wide = wide.join(crime, on="postcode", how="left")
wide = wide.with_columns(
pl.sum_horizontal(
"Violence and sexual offences (avg/yr)",
"Robbery (avg/yr)",
"Burglary (avg/yr)",
"Possession of weapons (avg/yr)",
).alias("serious_crime_avg_yr"),
pl.sum_horizontal(
"Anti-social behaviour (avg/yr)",
"Criminal damage and arson (avg/yr)",
"Shoplifting (avg/yr)",
"Bicycle theft (avg/yr)",
"Theft from the person (avg/yr)",
"Other theft (avg/yr)",
"Vehicle crime (avg/yr)",
"Public order (avg/yr)",
"Drugs (avg/yr)",
"Other crime (avg/yr)",
).alias("minor_crime_avg_yr"),
)
median_age = pl.scan_parquet(median_age_path)
wide = wide.join(median_age, on="lsoa21", how="left")
election = pl.scan_parquet(election_results_path)
wide = wide.join(election, on="pcon", how="left")
poi_counts = pl.scan_parquet(poi_proximity_path)
wide = wide.join(poi_counts, on="postcode", how="left")
noise_cols = ["road_noise_lden_db", "rail_noise_lden_db", "airport_noise_lden_db"]
noise = (
pl.scan_parquet(noise_path)
@ -1789,21 +1946,13 @@ def _build(
)
.select("postcode", "noise_lden_db")
)
wide = wide.join(noise, on="postcode", how="left")
school_proximity = pl.scan_parquet(school_proximity_path)
wide = wide.join(school_proximity, on="postcode", how="left")
conservation_areas = _conservation_area_by_postcode(
arcgis.select("postcode", "lat", "lon"), conservation_areas_path
)
wide = wide.join(conservation_areas, on="postcode", how="left").with_columns(
pl.col(CONSERVATION_AREA_FEATURE).fill_null("No")
)
tree_density = None
if tree_density_postcodes_path is not None:
tree_density = _tree_density_by_postcode(tree_density_postcodes_path)
wide = wide.join(tree_density, on="postcode", how="left")
# Broadband: derive max available download speed tier per postcode from
# Ofcom availability percentages. Tiers: Gigabit ≥1000, UFBB ≥300,
@ -1828,7 +1977,38 @@ def _build(
.agg(pl.col("max_download_speed").max())
.with_columns(pl.col("max_download_speed").cast(pl.Utf8))
)
wide = wide.join(broadband, left_on="postcode", right_on="bb_postcode", how="left")
area_side_tables = {
"iod": iod,
"ethnicity": ethnicity,
"crime": crime,
"median_age": median_age,
"election": election,
"poi_counts": poi_counts,
"noise": noise,
"school_proximity": school_proximity,
"conservation_areas": conservation_areas,
"tree_density": tree_density,
"broadband": broadband,
}
wide = _join_area_side_tables(wide, **area_side_tables)
postcode_area = _join_area_side_tables(postcode_area, **area_side_tables)
# Derive bedroom count: habitable rooms - 1 (assuming 1 reception room), clipped to 0..4
wide = wide.with_columns(
(pl.col("number_habitable_rooms") - 1)
.clip(0, 4)
.cast(pl.UInt8)
.alias("_bedrooms"),
)
rental = pl.scan_parquet(rental_prices_path).select(
"area_code", "bedrooms", "mean_monthly_rent"
)
wide = wide.join(
rental,
left_on=["Local Authority District code (2024)", "_bedrooms"],
right_on=["area_code", "bedrooms"],
how="left",
)
# Derive property_type: prefer EPC data, fall back to price-paid.
# For Houses, use built_form (e.g. Semi-Detached, Mid-Terrace) for finer detail.
@ -1862,112 +2042,40 @@ def _build(
.alias("property_type")
)
wide = (
wide.with_columns(
pl.when(pl.col("duration") == "U")
.then(None)
.otherwise(pl.col("duration"))
.alias("duration"),
pl.when(pl.col("current_energy_rating") == "INVALID!")
.then(None)
.otherwise(pl.col("current_energy_rating"))
.alias("current_energy_rating"),
)
.with_columns(
(pl.col("latest_price") / pl.col("total_floor_area"))
.round(0)
.cast(pl.Int32)
.alias("Price per sqm"),
)
.drop(
"inspection_date",
"_bedrooms",
"LSOA name (2021)",
"Local Authority District code (2024)",
"Local Authority District name (2024)",
"Wider Barriers Sub-domain Score",
"Geographical Barriers Sub-domain Score",
"Adult Skills Sub-domain Score",
"Children and Young People Sub-domain Score",
"Crime Score",
"Living Environment Score",
"Index of Multiple Deprivation (IMD) Score",
"Income Deprivation Affecting Older People (IDAOPI) Score (rate)",
"Income Deprivation Affecting Children Index (IDACI) Score (rate)",
"Barriers to Housing and Services Score",
"oa21",
"pcon",
"epc_property_type",
"pp_property_type",
"built_form",
)
.rename(
{
"date_of_transfer": "Date of last transaction",
"construction_age_band": "Construction year",
"is_construction_date_approximate": "Is construction date approximate",
"Income Score (rate)": "Income Score",
"Employment Score (rate)": "Employment Score",
"Indoors Sub-domain Score": "Housing Conditions Score",
"Outdoors Sub-domain Score": "Air Quality and Road Safety Score",
"pp_address": "Address per Property Register",
"epc_address": "Address per EPC",
"postcode": "Postcode",
"duration": "Leasehold/Freehold",
"current_energy_rating": "Current energy rating",
"potential_energy_rating": "Potential energy rating",
"total_floor_area": "Total floor area (sqm)",
"property_type": "Property type",
"restaurants_2km": "Number of restaurants within 2km",
"groceries_2km": "Number of grocery shops and supermarkets within 2km",
"latest_price": "Last known price",
"number_habitable_rooms": "Number of bedrooms & living rooms",
"noise_lden_db": "Noise (dB)",
"good_primary_5km": "Good+ primary schools within 5km",
"good_secondary_5km": "Good+ secondary schools within 5km",
"good_primary_2km": "Good+ primary schools within 2km",
"good_secondary_2km": "Good+ secondary schools within 2km",
"outstanding_primary_5km": "Outstanding primary schools within 5km",
"outstanding_secondary_5km": "Outstanding secondary schools within 5km",
"outstanding_primary_2km": "Outstanding primary schools within 2km",
"outstanding_secondary_2km": "Outstanding secondary schools within 2km",
"max_download_speed": "Max available download speed (Mbps)",
"serious_crime_avg_yr": "Serious crime (avg/yr)",
"minor_crime_avg_yr": "Minor crime (avg/yr)",
"mean_monthly_rent": "Estimated monthly rent",
"floor_height": "Interior height (m)",
"was_council_house": "Former council house",
"median_age": "Median age",
"turnout_pct": "Voter turnout (%)",
}
)
wide = wide.with_columns(
pl.when(pl.col("duration") == "U")
.then(None)
.otherwise(pl.col("duration"))
.alias("duration"),
pl.when(pl.col("current_energy_rating") == "INVALID!")
.then(None)
.otherwise(pl.col("current_energy_rating"))
.alias("current_energy_rating"),
).with_columns(
(pl.col("latest_price") / pl.col("total_floor_area"))
.round(0)
.cast(pl.Int32)
.alias("Price per sqm"),
)
wide = _finalize_merged_columns(wide)
postcode_area = _finalize_merged_columns(postcode_area)
print("Collecting with streaming engine...")
df = wide.collect(engine="streaming")
if mode == "listings":
df = wide.collect(engine="streaming")
enriched_listings = _finalize_listings(df)
_validate_property_postcodes(enriched_listings)
print(f"Enriched listings rows: {enriched_listings.height}")
return _BuildResult(listings=enriched_listings)
df, postcode_features = pl.collect_all([wide, postcode_area], engine="streaming")
_validate_property_postcodes(df)
# Split into postcode-level and property-level dataframes
area_cols = [
c for c in df.columns if c in _AREA_COLUMNS or _is_dynamic_poi_metric_column(c)
]
postcode_df = df.select(area_cols).group_by("Postcode").first()
postcode_df, properties_df = _split_normal_outputs(
df, postcode_features, expected_postcode_count=active_postcode_count
)
print(f"Postcode rows: {postcode_df.height} (unique postcodes)")
property_cols = [
c
for c in df.columns
if (c not in _AREA_COLUMNS and not _is_dynamic_poi_metric_column(c))
or c == "Postcode"
]
properties_df = df.select(property_cols)
print(f"Property rows: {properties_df.height}")
return _BuildResult(postcode=postcode_df, properties=properties_df)

View file

@ -1,10 +1,12 @@
import json
import shutil
from collections import defaultdict
from pathlib import Path
from pyproj import Transformer
from shapely import make_valid
from shapely.geometry import MultiPolygon, Polygon
from shapely import make_valid, set_precision
from shapely.geometry import MultiPolygon, Polygon, mapping, shape
from shapely.ops import transform as transform_geometry
from shapely.ops import unary_union
from tqdm import tqdm
@ -18,49 +20,47 @@ def _get_to_wgs84():
return _to_wgs84
def _largest_polygonal(geom) -> Polygon | None:
if geom is None or geom.is_empty:
return None
if not geom.is_valid:
geom = make_valid(geom)
if geom.geom_type == "Polygon":
return geom
if geom.geom_type == "MultiPolygon":
return max(geom.geoms, key=lambda g: g.area)
if geom.geom_type == "GeometryCollection":
polygons = [
polygon
for part in geom.geoms
if (polygon := _largest_polygonal(part)) is not None
]
if polygons:
return max(polygons, key=lambda g: g.area)
return None
def to_wgs84_geojson(
geom: Polygon | MultiPolygon, tolerance: float = 1.0
) -> dict | None:
"""Simplify geometry in BNG, convert to WGS84, return GeoJSON dict."""
if geom.is_empty:
geom = _largest_polygonal(geom)
if geom is None:
return None
simplified = geom.simplify(tolerance, preserve_topology=True)
if simplified.is_empty:
simplified = _largest_polygonal(simplified)
if simplified is None:
return None
transformer = _get_to_wgs84()
def transform_ring(coords):
xs, ys = zip(*coords)
lons, lats = transformer.transform(list(xs), list(ys))
return [(round(lon, 6), round(lat, 6)) for lon, lat in zip(lons, lats)]
def transform_polygon(poly):
exterior = transform_ring(poly.exterior.coords)
holes = [transform_ring(h.coords) for h in poly.interiors]
return [exterior] + holes
# Force single Polygon — postcodes are contiguous delivery routes
if simplified.geom_type == "MultiPolygon":
simplified = max(simplified.geoms, key=lambda g: g.area)
elif simplified.geom_type == "GeometryCollection":
polys = [
g for g in simplified.geoms if g.geom_type in ("Polygon", "MultiPolygon")
]
if not polys:
return None
simplified = max(polys, key=lambda g: g.area)
if simplified.geom_type == "MultiPolygon":
simplified = max(simplified.geoms, key=lambda g: g.area)
if simplified.geom_type != "Polygon" or simplified.is_empty:
wgs84 = transform_geometry(transformer.transform, simplified)
wgs84 = set_precision(wgs84, 0.000001, mode="valid_output")
wgs84 = _largest_polygonal(wgs84)
if wgs84 is None:
return None
return {
"type": "Polygon",
"coordinates": transform_polygon(simplified),
}
return mapping(wgs84)
def _fill_holes(geom):
@ -132,7 +132,11 @@ def write_district_geojson(
) -> int:
"""Group postcodes by district, write GeoJSON files. Returns file count."""
units_dir = output_dir / "units"
units_dir.mkdir(parents=True, exist_ok=True)
tmp_units_dir = output_dir / "units.tmp"
output_dir.mkdir(parents=True, exist_ok=True)
if tmp_units_dir.exists():
shutil.rmtree(tmp_units_dir)
tmp_units_dir.mkdir(parents=True)
by_district: dict[str, list[tuple[str, Polygon | MultiPolygon]]] = defaultdict(list)
for pc, geom in postcodes.items():
@ -141,14 +145,23 @@ def write_district_geojson(
by_district[district].append((pc, geom))
file_count = 0
seen_postcodes: set[str] = set()
for district, entries in tqdm(
sorted(by_district.items()), desc="Writing GeoJSON", unit="file"
):
features = []
for pc, geom in sorted(entries, key=lambda x: x[0]):
if pc in seen_postcodes:
raise ValueError(f"Duplicate postcode boundary feature: {pc}")
seen_postcodes.add(pc)
geojson_geom = to_wgs84_geojson(geom)
if geojson_geom is None:
continue
raise ValueError(f"Postcode boundary collapsed to empty geometry: {pc}")
written_geom = shape(geojson_geom)
if written_geom.is_empty or not written_geom.is_valid:
raise ValueError(
f"Invalid postcode boundary geometry after output: {pc}"
)
mapit_code = pc.replace(" ", "")
features.append(
{
@ -165,9 +178,12 @@ def write_district_geojson(
continue
collection = {"type": "FeatureCollection", "features": features}
out_path = units_dir / f"{district}.geojson"
out_path = tmp_units_dir / f"{district}.geojson"
with open(out_path, "w") as f:
json.dump(collection, f, separators=(",", ":"))
file_count += 1
if units_dir.exists():
shutil.rmtree(units_dir)
tmp_units_dir.replace(units_dir)
return file_count

View file

@ -3,6 +3,8 @@
Each test targets a specific bug or edge case identified during code review.
"""
import json
import numpy as np
import polars as pl
import pytest
@ -11,7 +13,12 @@ from shapely.ops import unary_union
from .oa_boundaries import parse_gpkg_geometry
from .greenspace import subtract_greenspace
from .output import _fill_holes, merge_fragments, to_wgs84_geojson
from .output import (
_fill_holes,
merge_fragments,
to_wgs84_geojson,
write_district_geojson,
)
from .process_oa import _extract_polygonal, process_oa
from .uprn import get_oa_uprns, load_uprns
from .voronoi import _equal_split_fallback, compute_voronoi_regions
@ -154,6 +161,7 @@ class TestWhitespacePostcodes:
"pcds": ["AA1 1AA", "AA1 1AB"],
"east1m": [500010, 500030],
"north1m": [180010, 180020],
"oa21cd": ["E00000001", "E00000001"],
"doterm": ["2020-01-01", None],
"ctry25cd": ["E92000001", "E92000001"],
}
@ -165,6 +173,65 @@ class TestWhitespacePostcodes:
assert loaded_df["PCDS"].to_list() == ["AA1 1AB"]
def test_arcgis_filters_to_active_english_postcodes(self, tmp_path):
    """Only the English postcode's UPRN survives when ArcGIS is supplied."""
    uprn_path = tmp_path / "uprn.parquet"
    arcgis_path = tmp_path / "arcgis.parquet"

    # Two UPRNs, one per postcode.
    pl.DataFrame(
        {
            "GRIDGB1E": [500010, 500020],
            "GRIDGB1N": [180010, 180020],
            "PCDS": ["AA1 1AA", "CF1 1AA"],
            "OA21CD": ["E00000001", "E00000001"],
        }
    ).write_parquet(uprn_path)
    # Both postcodes are active (null doterm); CF1 1AA has a non-English
    # country code.
    pl.DataFrame(
        {
            "pcds": ["AA1 1AA", "CF1 1AA"],
            "east1m": [500010, 300010],
            "north1m": [180010, 220010],
            "oa21cd": ["E00000001", "W00000001"],
            "doterm": [None, None],
            "ctry25cd": ["E92000001", "W92000004"],
        }
    ).write_parquet(arcgis_path)

    loaded_df, _ = load_uprns(uprn_path, arcgis_path)

    assert loaded_df["PCDS"].to_list() == ["AA1 1AA"]
def test_arcgis_adds_centroid_seed_for_active_postcode_without_uprn(self, tmp_path):
    """An active postcode with no UPRN is seeded from its ArcGIS coordinates."""
    uprn_path = tmp_path / "uprn.parquet"
    arcgis_path = tmp_path / "arcgis.parquet"

    # Only AA1 1AA has a UPRN; BB1 1BB appears solely in the ArcGIS lookup.
    pl.DataFrame(
        {
            "GRIDGB1E": [500010],
            "GRIDGB1N": [180010],
            "PCDS": ["AA1 1AA"],
            "OA21CD": ["E00000001"],
        }
    ).write_parquet(uprn_path)
    pl.DataFrame(
        {
            "pcds": ["AA1 1AA", "BB1 1BB"],
            "east1m": [500010, 510000],
            "north1m": [180010, 190000],
            "oa21cd": ["E00000001", "E00000002"],
            "doterm": [None, None],
            "ctry25cd": ["E92000001", "E92000001"],
        }
    ).write_parquet(arcgis_path)

    loaded_df, offsets = load_uprns(uprn_path, arcgis_path)
    loaded_postcodes = set(loaded_df["PCDS"].to_list())
    assert loaded_postcodes == {"AA1 1AA", "BB1 1BB"}

    # The seeded row sits in BB1 1BB's own OA at the ArcGIS easting/northing.
    points, postcodes = get_oa_uprns(loaded_df, offsets, "E00000002")
    assert postcodes == ["BB1 1BB"]
    assert points.tolist() == [[510000.0, 190000.0]]
# ---------------------------------------------------------------------------
# Bug 3: Voronoi deduplication is first-seen-wins
@ -450,7 +517,9 @@ class TestProcessOAInspireParcelAssignment:
)
postcodes = ["A", "B"]
fragments = process_oa(oa_geom, points, postcodes, inspire_candidates=[left, right])
fragments = process_oa(
oa_geom, points, postcodes, inspire_candidates=[left, right]
)
frag_dict = dict(fragments)
assert "A" in frag_dict and "B" in frag_dict
@ -494,7 +563,9 @@ class TestProcessOAInspireParcelAssignment:
)
postcodes = ["A", "B"]
fragments = process_oa(oa_geom, points, postcodes, inspire_candidates=[left, right])
fragments = process_oa(
oa_geom, points, postcodes, inspire_candidates=[left, right]
)
frag_dict = dict(fragments)
assert "A" in frag_dict and "B" in frag_dict
@ -539,7 +610,9 @@ class TestProcessOAInspireParcelAssignment:
)
postcodes = ["A", "B"]
fragments = process_oa(oa_geom, points, postcodes, inspire_candidates=[straddling])
fragments = process_oa(
oa_geom, points, postcodes, inspire_candidates=[straddling]
)
for _, geom in fragments:
assert geom.difference(oa_geom).area < 0.01
@ -651,6 +724,22 @@ class TestToWgs84Geojson:
assert lon_dp <= 6, f"Longitude {lon_s} has {lon_dp} decimal places"
assert lat_dp <= 6, f"Latitude {lat_s} has {lat_dp} decimal places"
def test_write_district_geojson_replaces_stale_units(self, tmp_path):
    """District files left over from a previous run are removed on rewrite."""
    units_dir = tmp_path / "units"
    units_dir.mkdir()
    stale_file = units_dir / "ZZ1.geojson"
    empty_collection = {"type": "FeatureCollection", "features": []}
    stale_file.write_text(json.dumps(empty_collection))

    postcodes = {"AA1 1AA": box(530000, 180000, 530100, 180100)}
    file_count = write_district_geojson(postcodes, tmp_path)

    assert file_count == 1
    assert not stale_file.exists()
    fresh_file = units_dir / "AA1.geojson"
    written = json.loads(fresh_file.read_text())
    assert written["features"][0]["properties"]["postcodes"] == "AA1 1AA"
# ---------------------------------------------------------------------------
# Edge case: parse_gpkg_geometry rejects unknown envelope types

View file

@ -13,6 +13,33 @@ def _canonical_postcode_expr(name: str) -> pl.Expr:
return pl.col(name).str.strip_chars().str.to_uppercase()
def _active_english_arcgis_postcodes(arcgis_path: Path) -> pl.LazyFrame:
    """Return one coordinate row per active English postcode from ArcGIS.

    Reads only the needed columns from ``arcgis_path``, keeps rows whose
    country code is ``E92000001`` and whose ``doterm`` is null, and renames
    the columns to the UPRN-lookup names (``PCDS``, ``GRIDGB1E``,
    ``GRIDGB1N``, ``OA21CD``). Rows with a missing or blank postcode, a
    missing coordinate, or an output area not starting with ``E`` are
    dropped, and the result is de-duplicated on ``PCDS``.
    """
    source_columns = ["pcds", "east1m", "north1m", "oa21cd", "ctry25cd", "doterm"]
    is_english = pl.col("ctry25cd") == "E92000001"
    is_active = pl.col("doterm").cast(pl.Utf8).is_null()
    has_usable_seed = (
        pl.col("PCDS").is_not_null()
        & (pl.col("PCDS") != "")
        & pl.col("GRIDGB1E").is_not_null()
        & pl.col("GRIDGB1N").is_not_null()
        & pl.col("OA21CD").is_not_null()
        & pl.col("OA21CD").str.starts_with("E")
    )

    raw = pl.read_parquet(arcgis_path, columns=source_columns).lazy()
    renamed = raw.filter(is_english).filter(is_active).select(
        _canonical_postcode_expr("pcds").alias("PCDS"),
        pl.col("east1m").cast(pl.Float64).alias("GRIDGB1E"),
        pl.col("north1m").cast(pl.Float64).alias("GRIDGB1N"),
        pl.col("oa21cd").alias("OA21CD"),
    )
    return renamed.filter(has_usable_seed).unique("PCDS")
def load_uprns(
uprn_path: Path, arcgis_path: Path | None = None
) -> tuple[pl.DataFrame, dict[str, tuple[int, int]]]:
@ -25,6 +52,7 @@ def load_uprns(
print("Loading UPRN lookup...")
mapping = None
active_postcode_points = None
if arcgis_path is not None:
mapping = (
build_postcode_mapping(arcgis_path)
@ -34,6 +62,7 @@ def load_uprns(
)
.unique("old_postcode")
)
active_postcode_points = _active_english_arcgis_postcodes(arcgis_path)
# Sort via streaming sink to avoid polars doubling memory during in-memory sort
with tempfile.NamedTemporaryFile(
@ -51,11 +80,21 @@ def load_uprns(
if mapping is not None and mapping.height > 0:
uprns = (
uprns.join(mapping.lazy(), left_on="PCDS", right_on="old_postcode", how="left")
uprns.join(
mapping.lazy(), left_on="PCDS", right_on="old_postcode", how="left"
)
.with_columns(pl.coalesce("new_postcode", "PCDS").alias("PCDS"))
.select("GRIDGB1E", "GRIDGB1N", "PCDS", "OA21CD")
)
if active_postcode_points is not None:
active_postcodes = active_postcode_points.select("PCDS").unique()
uprns = uprns.join(active_postcodes, on="PCDS", how="semi")
missing_active = active_postcode_points.join(
uprns.select("PCDS").unique(), on="PCDS", how="anti"
).select("GRIDGB1E", "GRIDGB1N", "PCDS", "OA21CD")
uprns = pl.concat([uprns, missing_active], how="vertical_relaxed")
uprns.sort("OA21CD").sink_parquet(tmp_path)
release_memory()

View file

@ -0,0 +1,52 @@
import json
from pipeline.transform.crime_hotspot_tiles import _write_geojsonseq
_HEADER = (
"Crime ID,Month,Reported by,Falls within,Longitude,Latitude,Location,"
"LSOA code,LSOA name,Crime type,Last outcome category,Context"
)
def _row(lon, lat, month, crime_type):
return f",{month},F,F,{lon},{lat},On or near X,E01000001,L,{crime_type},U,"
def _write_csv(path, rows):
path.write_text("\n".join([_HEADER, *rows]) + "\n")
def test_write_geojsonseq_collapses_shared_anchors_into_weighted_features(tmp_path):
    """Incidents sharing an anchor, month and type become one weighted feature."""
    csv = tmp_path / "2024-01-test-street.csv"
    _write_csv(
        csv,
        [
            # Two incidents snapped to the exact same anchor/month/type -> one
            # feature with count=2.
            _row(-0.1, 51.5, "2024-01", "Burglary"),
            _row(-0.1, 51.5, "2024-01", "Burglary"),
            # Same coord, different crime type -> kept separate (per-type filter).
            _row(-0.1, 51.5, "2024-01", "Robbery"),
            # Out of bounds -> dropped entirely.
            _row(-0.1, 80.0, "2024-01", "Burglary"),
            # Missing coordinate -> dropped entirely.
            _row("", "", "2024-01", "Burglary"),
        ],
    )
    out = tmp_path / "hotspots.geojsonseq"

    feature_count, incident_count = _write_geojsonseq([csv], out)
    features = [json.loads(line) for line in out.read_text().splitlines()]

    assert feature_count == 2
    assert incident_count == 3  # 2 burglaries + 1 robbery, in-bounds only
    by_type = {f["properties"]["crime_type"]: f["properties"] for f in features}
    # The busy anchor is a single feature carrying its full incident weight,
    # so tippecanoe's density thinning can no longer silently erase it.
    assert by_type["Burglary"]["count"] == 2
    assert by_type["Burglary"]["weight"] == 2
    assert by_type["Robbery"]["count"] == 1
    # Geometry preserved as [lon, lat].
    assert all(f["geometry"]["coordinates"] == [-0.1, 51.5] for f in features)

View file

@ -1,9 +1,13 @@
import json
import numpy as np
import polars as pl
import pytest
import shapely
from pyproj import Transformer
from pipeline.transform.crime_spatial import transform_crime_spatial
from pipeline.transform.postcode_boundaries.loader import load_postcode_polygons
_TO_WGS84 = Transformer.from_crs("EPSG:27700", "EPSG:4326", always_xy=True)
@ -82,7 +86,10 @@ def test_buffer_overlap_counts_for_each_postcode(tmp_path):
output = tmp_path / "crime_by_postcode.parquet"
by_year = tmp_path / "crime_by_postcode_by_year.parquet"
transform_crime_spatial(crime, units, output, by_year)
# Pin the 50m buffer the geometry above was designed around (the production
# default is now 100m). The three squares are equal-area, so area
# normalisation leaves the counts unchanged.
transform_crime_spatial(crime, units, output, by_year, buffer_m=50.0)
rows = {
r["postcode"]: r
@ -127,7 +134,7 @@ def test_by_year_annualises_and_rolls_up(tmp_path):
output = tmp_path / "crime_by_postcode.parquet"
by_year = tmp_path / "crime_by_postcode_by_year.parquet"
transform_crime_spatial(crime, units, output, by_year)
transform_crime_spatial(crime, units, output, by_year, buffer_m=50.0)
by_year_df = pl.read_parquet(by_year)
assert by_year_df.height == 1
@ -145,3 +152,130 @@ def test_by_year_annualises_and_rolls_up(tmp_path):
# 2023 serious = Burglary(12) + Robbery(12) = 24; 2024 = Burglary(12).
assert serious[2023] == 24.0
assert serious[2024] == 12.0
def test_area_normalisation_divides_out_buffered_catchment(tmp_path):
    """Counts are rescaled by median buffered catchment over own buffered area."""
    # Three postcodes of increasing footprint, each with exactly one incident
    # in its buffer. Because the divisor is the *buffered* catchment rather
    # than the raw polygon, the fixed buffer ring keeps the spread gentle:
    # the smallest postcode scores highest, the median-sized one is
    # unchanged, and the tiniest is not blown up out of proportion.
    footprints = {
        "AB1 1AA": (1000, 1000, 1010, 1010),  # 10x10
        "AB1 1AB": (3000, 3000, 3010, 3020),  # 10x20 (median)
        "AB1 1AC": (5000, 5000, 5020, 5020),  # 20x20
    }
    incident_points = [(1005, 1005), (3005, 3010), (5010, 5010)]

    units = tmp_path / "units"
    _write_boundaries(
        units,
        {"AB1": [_square_feature(pc, *bounds) for pc, bounds in footprints.items()]},
    )
    crime = tmp_path / "crime"
    _write_month(
        crime,
        "2024-01",
        [_crime_row("2024-01", x, y, "Burglary") for x, y in incident_points],
    )
    output = tmp_path / "crime_by_postcode.parquet"
    by_year = tmp_path / "crime_by_postcode_by_year.parquet"

    transform_crime_spatial(crime, units, output, by_year, buffer_m=50.0)

    # Re-derive the expectation from the same buffered catchment areas: each
    # postcode is 12/yr before normalisation, then x (median_buf / own area).
    postcodes, polygons = load_postcode_polygons(units)
    buf_area = {}
    for pc, poly in zip(postcodes, polygons):
        buffered = shapely.buffer(poly, 50.0, quad_segs=8)
        buf_area[pc] = float(shapely.area(buffered))
    median_buf = float(np.median(list(buf_area.values())))
    expected = {pc: 12.0 * median_buf / area for pc, area in buf_area.items()}

    rows = {record["postcode"]: record for record in pl.read_parquet(output).to_dicts()}
    for pc, want in expected.items():
        assert rows[pc]["Burglary (avg/yr)"] == pytest.approx(want, abs=0.1)

    # Median catchment unchanged; ordering is by inverse buffered area, but the
    # buffer-ring floor keeps the spread far below the ~4x raw-area ratio.
    assert rows["AB1 1AB"]["Burglary (avg/yr)"] == pytest.approx(12.0, abs=0.05)
    small = rows["AB1 1AA"]["Burglary (avg/yr)"]
    big = rows["AB1 1AC"]["Burglary (avg/yr)"]
    assert small > 12.0 > big
    assert small / big < 1.5

    # by-year series carries the same normalisation.
    yearly = pl.read_parquet(by_year)
    small_row = yearly.filter(pl.col("postcode") == "AB1 1AA").row(0, named=True)
    expected_series = [
        {"year": 2024, "count": pytest.approx(expected["AB1 1AA"], abs=0.1)}
    ]
    assert small_row["Burglary (by year)"] == expected_series
def test_avg_yr_is_simple_mean_of_year_bars(tmp_path):
    """The headline average is the unweighted mean of the per-year bars."""
    # Uneven month coverage: 2023 has 1 month (2 incidents -> 24/yr) and 2024
    # has 2 months (2 incidents -> 12/yr). The headline must be the *simple*
    # mean of the bars, (24+12)/2 = 18, not the month-weighted pooled rate
    # (4 incidents / 3 months * 12 = 16).
    incidents_per_month = {"2023-01": 2, "2024-01": 1, "2024-02": 1}

    units = tmp_path / "units"
    _write_boundaries(
        units, {"AB1": [_square_feature("AB1 1AA", 1000, 1000, 1010, 1010)]}
    )
    crime = tmp_path / "crime"
    for month, incident_total in incidents_per_month.items():
        _write_month(
            crime,
            month,
            [_crime_row(month, 1005, 1005, "Burglary") for _ in range(incident_total)],
        )
    output = tmp_path / "crime_by_postcode.parquet"
    by_year = tmp_path / "crime_by_postcode_by_year.parquet"

    transform_crime_spatial(crime, units, output, by_year, buffer_m=50.0)

    headline = pl.read_parquet(output).row(0, named=True)
    assert headline["Burglary (avg/yr)"] == pytest.approx(18.0, abs=0.05)

    series = pl.read_parquet(by_year).row(0, named=True)["Burglary (by year)"]
    bars = {point["year"]: point["count"] for point in series}
    expected_bars = {
        2023: pytest.approx(24.0, abs=0.05),
        2024: pytest.approx(12.0, abs=0.05),
    }
    assert bars == expected_bars
def test_unknown_crime_type_is_dropped_with_warning(tmp_path, capsys):
    """An unrecognised crime type gets no output column but is reported on stderr."""
    units = tmp_path / "units"
    crime = tmp_path / "crime"
    output = tmp_path / "crime_by_postcode.parquet"
    by_year = tmp_path / "crime_by_postcode_by_year.parquet"

    _write_boundaries(
        units, {"AB1": [_square_feature("AB1 1AA", 1000, 1000, 1010, 1010)]}
    )
    crime_types = ("Burglary", "Cyber fraud")
    _write_month(
        crime,
        "2024-01",
        [_crime_row("2024-01", 1005, 1005, crime_type) for crime_type in crime_types],
    )

    transform_crime_spatial(crime, units, output, by_year, buffer_m=50.0)

    # The unknown type is dropped (no column for it) but a warning is emitted.
    output_columns = pl.read_parquet(output).columns
    assert "Cyber fraud (avg/yr)" not in output_columns
    assert "Burglary (avg/yr)" in output_columns
    stderr_text = capsys.readouterr().err
    assert "Cyber fraud" in stderr_text
    assert "WARNING" in stderr_text

View file

@ -10,8 +10,10 @@ from pipeline.transform.merge import (
LISTED_BUILDING_FEATURE,
TREE_DENSITY_FEATURE,
_LISTING_OVERLAY_SOURCES,
_active_english_postcode_area,
_build_unmatched_listing_seed_rows,
_canonical_postcode_expr,
_filter_to_active_english_postcodes,
_finalize_listings,
_integrate_listings,
_match_direct_epc,
@ -24,8 +26,11 @@ from pipeline.transform.merge import (
_matched_listed_building_flags,
_postcode_conservation_area_flags,
_postcode_listed_building_candidates,
_remap_terminated_postcodes,
_split_normal_outputs,
_tree_density_by_postcode,
_validate_lad_source_coverage,
_validate_postcode_feature_output,
_validate_property_postcodes,
)
@ -79,6 +84,113 @@ def test_crime_columns_are_spatial_counts_not_per_capita() -> None:
assert "Minor crime per 1k residents (avg/yr)" not in _AREA_COLUMNS
def test_active_english_postcode_area_filters_to_active_england() -> None:
    """Terminated and non-English rows are removed and columns are renamed."""
    column_names = (
        "pcds",
        "ctry25cd",
        "doterm",
        "lat",
        "long",
        "lsoa21cd",
        "oa21cd",
        "pcon24cd",
    )
    records = [
        # Active English postcode: the only row expected to survive.
        ("AA1 1AA", "E92000001", None, 51.0, -0.1, "L1", "O1", "P1"),
        # English but terminated (doterm set).
        ("AA1 1AB", "E92000001", "2020-01-01", 51.1, -0.2, "L2", "O2", "P2"),
        # Active but with a non-English country code.
        ("CF1 1AA", "W92000004", None, 52.0, -3.0, "L3", "O3", "P3"),
    ]
    # Transpose the row tuples into the column-oriented dict polars expects.
    arcgis = pl.DataFrame(dict(zip(column_names, map(list, zip(*records)))))

    surviving = _active_english_postcode_area(arcgis.lazy()).collect().to_dicts()

    expected_row = {
        "postcode": "AA1 1AA",
        "lat": 51.0,
        "lon": -0.1,
        "ctry25cd": "E92000001",
        "lsoa21": "L1",
        "oa21": "O1",
        "pcon": "P1",
    }
    assert surviving == [expected_row]
def test_remap_then_active_filter_keeps_terminated_english_properties() -> None:
    """A remapped terminated postcode survives the active filter; others do not."""
    properties = pl.DataFrame(
        {
            "postcode": ["OLD 1AA", "NEW 1AA", "CF1 1AA"],
            "row_id": [1, 2, 3],
        }
    ).lazy()
    successor_mapping = pl.DataFrame(
        {"old_postcode": ["OLD 1AA"], "new_postcode": ["NEW 1AA"]}
    ).lazy()
    active = pl.DataFrame({"postcode": ["NEW 1AA"]}).lazy()

    # Remap first, then filter: row 1 arrives under its successor postcode.
    remapped = _remap_terminated_postcodes(properties, successor_mapping)
    kept = _filter_to_active_english_postcodes(remapped, active)
    result = kept.collect().sort("row_id")

    expected_rows = [{"postcode": "NEW 1AA", "row_id": row_id} for row_id in (1, 2)]
    assert result.to_dicts() == expected_rows
def test_split_normal_outputs_uses_postcode_feature_universe() -> None:
    """Postcode output rows come from the feature frame, not the property frame."""
    # One property, in AA1 1AA only.
    property_rows = pl.DataFrame(
        {
            "Postcode": ["AA1 1AA"],
            "Address per Property Register": ["1 Example Road"],
            "Last known price": [250_000],
            "lat": [51.0],
            "lon": [-0.1],
            "ctry25cd": ["E92000001"],
            "lsoa21": ["L1"],
        }
    )
    # Two postcodes with features; BB1 1BB has no property at all.
    feature_rows = pl.DataFrame(
        {
            "Postcode": ["AA1 1AA", "BB1 1BB"],
            "lat": [51.0, 52.0],
            "lon": [-0.1, -0.2],
            "ctry25cd": ["E92000001", "E92000001"],
            "lsoa21": ["L1", "L2"],
            "Distance to nearest amenity (Park) (km)": [0.3, 0.8],
        }
    )

    postcode_df, properties_df = _split_normal_outputs(
        property_rows, feature_rows, expected_postcode_count=2
    )

    assert postcode_df["Postcode"].to_list() == ["AA1 1AA", "BB1 1BB"]
    assert "Distance to nearest amenity (Park) (km)" in postcode_df.columns
    expected_property = {
        "Postcode": "AA1 1AA",
        "Address per Property Register": "1 Example Road",
        "Last known price": 250_000,
    }
    assert properties_df.to_dicts() == [expected_property]
def test_postcode_feature_validation_rejects_unsupported_or_ungeocoded_rows() -> None:
    """Validation raises when a row has a non-English code and no coordinates."""
    # The second row has null lat/lon and a country code other than E92000001.
    bad_features = pl.DataFrame(
        {
            "Postcode": ["AA1 1AA", "CF1 1AA"],
            "lat": [51.0, None],
            "lon": [-0.1, None],
            "ctry25cd": ["E92000001", "W92000004"],
        }
    )

    with pytest.raises(ValueError, match="unsupported or ungeocoded"):
        _validate_postcode_feature_output(bad_features, expected_postcode_count=2)
def test_listed_building_feature_is_property_level() -> None:
assert LISTED_BUILDING_FEATURE not in _AREA_COLUMNS
@ -418,9 +530,7 @@ def test_build_unmatched_listing_seed_rows_fills_property_shape_fields(
)
unmatched_idxs = listings.select("_listing_idx")
seed = _build_unmatched_listing_seed_rows(
unmatched_idxs, listings, template_schema
)
seed = _build_unmatched_listing_seed_rows(unmatched_idxs, listings, template_schema)
assert seed.height == 1
assert seed["postcode"].to_list() == ["SW1A 1AA"]
@ -550,7 +660,12 @@ def test_match_direct_epc_matches_by_uprn_across_postcodes() -> None:
[{"_listing_uprn": "100000000001", "_listing_match_postcode": "ZZ99ZZ"}]
),
_direct_epc_candidates(
[{"_direct_epc_uprn": "100000000001", "_direct_epc_match_postcode": "AA11AA"}]
[
{
"_direct_epc_uprn": "100000000001",
"_direct_epc_match_postcode": "AA11AA",
}
]
),
)

View file

@ -1,81 +1,105 @@
import math
import zipfile
from pathlib import Path
import numpy as np
import polars as pl
import pyogrio
import pytest
import shapely
from pipeline.transform.tree_density import (
STREET_TREE_COVERAGE_COL,
STREET_TREE_DENSITY_COL,
_add_nfi_batch,
_accumulate_clipped_area,
_coverage_percentile_expr,
_finalize_metrics,
_geometry_column,
_layers,
_metric_columns,
_nfi_dataset_path,
_postcode_buffers,
_postcode_density_percentile_col,
_safe_extract_zip_dir,
_with_postcode_density_percentiles,
_write_street_rollups,
)
def test_nfi_accumulation_adds_only_clipped_overlap_area() -> None:
def test_accumulate_clipped_area_adds_only_in_buffer_overlap() -> None:
radius_m = 50
points = pl.DataFrame({"postcode": ["A", "B"], "x": [0.0, 1000.0], "y": [0.0, 0.0]})
circles, tree = _postcode_buffers(points, radius_m)
buffer_area = math.pi * radius_m * radius_m
# A large woodland square centred on postcode A fully covers A's circle.
# A large square centred on postcode A fully covers A's buffer circle.
canopy_area = np.zeros(2)
feature_count = np.zeros(2, dtype=np.uint32)
big = shapely.box(-500, -500, 500, 500) # 1,000,000 sqm parcel
_add_nfi_batch(
np.array([big], dtype=object),
np.array(["Woodland"], dtype=object),
circles,
tree,
canopy_area,
feature_count,
radius_m,
)
_accumulate_clipped_area(np.array([big], dtype=object), circles, tree, canopy_area)
# Only the clipped circle area is added (the 32-gon buffer approximates the
# circle to ~1%), NOT the full 1,000,000 sqm polygon.
assert canopy_area[0] == pytest.approx(buffer_area, rel=1e-2)
assert canopy_area[0] <= buffer_area # never exceeds the buffer area
assert canopy_area[0] <= buffer_area # never exceeds the true buffer area
assert canopy_area[1] == 0.0 # postcode B is 1km away, no overlap
assert feature_count.tolist() == [1, 0]
# A large parcel that only slivers into B's circle must add only the sliver,
# not its full area -- the failure mode the old centroid path could not avoid.
# not its full area -- the failure mode a centroid/full-area path could not avoid.
canopy_area = np.zeros(2)
feature_count = np.zeros(2, dtype=np.uint32)
sliver = shapely.box(1040, -500, 2000, 500) # left edge 10m inside B's circle
_add_nfi_batch(
np.array([sliver], dtype=object),
np.array(["Woodland"], dtype=object),
circles,
tree,
canopy_area,
feature_count,
radius_m,
_accumulate_clipped_area(
np.array([sliver], dtype=object), circles, tree, canopy_area
)
assert canopy_area[0] == 0.0
assert 0.0 < canopy_area[1] < buffer_area # tiny segment, far below 1M sqm
# Non-woodland categories contribute nothing.
canopy_area = np.zeros(2)
feature_count = np.zeros(2, dtype=np.uint32)
_add_nfi_batch(
np.array([big], dtype=object),
np.array(["Non woodland"], dtype=object),
def test_accumulate_clipped_area_drops_missing_and_empty_geometry() -> None:
    """Missing (None) and empty geometries are skipped instead of crashing."""
    radius_m = 50
    single_postcode = pl.DataFrame({"postcode": ["A"], "x": [0.0], "y": [0.0]})
    circles, tree = _postcode_buffers(single_postcode, radius_m)
    canopy_area = np.zeros(1)

    missing = None
    empty = shapely.from_wkt("POLYGON EMPTY")
    real_box = shapely.box(-10, -10, 10, 10)  # 20m x 20m, fully inside the buffer
    batch = np.array([missing, empty, real_box], dtype=object)

    _accumulate_clipped_area(batch, circles, tree, canopy_area)

    # Only the real 400 sqm box contributes to the accumulator.
    assert canopy_area[0] == pytest.approx(400.0)
def test_accumulate_clipped_area_height_weighted_by_overlap() -> None:
radius_m = 50
points = pl.DataFrame({"postcode": ["A"], "x": [0.0], "y": [0.0]})
circles, tree = _postcode_buffers(points, radius_m)
canopy_area = np.zeros(1)
height_weighted_sum = np.zeros(1)
height_weight = np.zeros(1)
geoms = np.array(
[
shapely.box(-10, -10, 0, 0), # 100 sqm, fully inside
shapely.box(0, 0, 20, 20), # 400 sqm, fully inside
shapely.box(-5, 0, 0, 5), # 25 sqm, NaN height -> ignored for height
],
dtype=object,
)
height = np.array([5.0, 10.0, np.nan])
_accumulate_clipped_area(
geoms,
circles,
tree,
canopy_area,
feature_count,
radius_m,
height=height,
height_weighted_sum=height_weighted_sum,
height_weight=height_weight,
)
assert canopy_area.tolist() == [0.0, 0.0]
assert feature_count.tolist() == [0, 0]
# All three clipped areas count toward canopy; only the finite-height ones
# contribute to the area-weighted mean height.
assert canopy_area[0] == pytest.approx(525.0)
assert height_weight[0] == pytest.approx(500.0)
mean_height = height_weighted_sum[0] / height_weight[0]
assert mean_height == pytest.approx((5.0 * 100 + 10.0 * 400) / 500) # 9.0
def test_coverage_percentile_expr_ranks_higher_coverage_higher() -> None:
@ -88,76 +112,142 @@ def test_coverage_percentile_expr_ranks_higher_coverage_higher() -> None:
assert result["percentile"].to_list() == [0.0, 50.0, 100.0, None]
def test_coverage_percentile_expr_uses_exact_scale_endpoints() -> None:
def test_coverage_percentile_expr_uses_tie_consistent_average_rank() -> None:
# Tied extremes share their mean rank instead of being pinned to exact 0/100,
# so the whole scale runs on one consistent average-rank formula.
df = pl.DataFrame({"coverage": [0.0, 0.0, 5.0, 10.0, 10.0]})
result = df.lazy().with_columns(
_coverage_percentile_expr("coverage", "percentile")
).collect()
assert result["percentile"].to_list() == [0.0, 0.0, 50.0, 100.0, 100.0]
assert result["percentile"].to_list() == [12.5, 12.5, 50.0, 87.5, 87.5]
def test_street_rollup_percentiles_are_ranked_over_raw_street_coverage(
tmp_path: Path,
) -> None:
def test_coverage_percentile_expr_all_equal_is_neutral_midpoint() -> None:
    """Columns without spread (all equal / single value) map to 50; nulls stay null."""

    def ranked(values: list) -> list:
        # Run the percentile expression over a one-column frame and return it.
        frame = pl.DataFrame({"coverage": values})
        expr = _coverage_percentile_expr("coverage", "percentile")
        return frame.lazy().with_columns(expr).collect()["percentile"].to_list()

    cases = [
        ([5.0, 5.0, 5.0], [50.0, 50.0, 50.0]),  # all equal
        ([7.0], [50.0]),  # single value
        ([None, 5.0, 5.0, 5.0], [None, 50.0, 50.0, 50.0]),  # null passes through
    ]
    for values, expected in cases:
        assert ranked(values) == expected
def test_finalize_metrics_caps_density_keeps_raw_area_and_weights_height() -> None:
radius_m = 50
density_col, area_col, count_col, height_col = _metric_columns(radius_m)
buffer_area = math.pi * radius_m * radius_m
density_col, area_col, height_col = _metric_columns(radius_m)
points = pl.DataFrame({"postcode": ["AA1 1AA", "AA1 1AB", "AA1 1AC"]})
canopy_area = np.array([0.0, buffer_area * 0.5, buffer_area * 2.0])
# Postcode 0: no height samples -> null. Postcode 1: area-weighted mean = 5.
height_weighted_sum = np.array([0.0, 500.0, 0.0])
height_weight = np.array([0.0, 100.0, 0.0])
metrics = _finalize_metrics(
points, canopy_area, height_weighted_sum, height_weight, radius_m
)
assert metrics[density_col].to_list() == [0.0, 50.0, 100.0] # capped at 100
# area_col is the raw clipped accumulation, intentionally uncapped.
assert metrics[area_col].to_list() == pytest.approx(
[0.0, round(buffer_area * 0.5, 1), round(buffer_area * 2.0, 1)]
)
assert metrics[height_col].to_list() == [None, 5.0, None]
# The mixed-unit feature-count column has been removed entirely.
assert "Tree features within 50m" not in metrics.columns
assert set(metrics.columns) == {"postcode", density_col, area_col, height_col}
def test_postcode_density_percentiles_rank_over_density() -> None:
radius_m = 50
density_col, area_col, height_col = _metric_columns(radius_m)
percentile_col = _postcode_density_percentile_col(radius_m)
postcode_metrics = _with_postcode_density_percentiles(
metrics = _with_postcode_density_percentiles(
pl.DataFrame(
{
"postcode": ["AA1 1AA", "AA1 1AB", "AA1 1AC"],
density_col: [10.0, 30.0, 50.0],
area_col: [100.0, 300.0, 500.0],
count_col: [1, 3, 5],
height_col: [4.0, 6.0, 8.0],
}
),
radius_m,
)
price_paid = pl.DataFrame(
{
"postcode": ["AA1 1AA", "AA1 1AA", "AA1 1AB", "AA1 1AC"],
"paon": ["1", "2", "3", "4"],
"saon": ["", "", "", ""],
"street": ["Oak Road", "Oak Road", "Oak Road", "Elm Street"],
"locality": ["", "", "", ""],
"town_city": ["Test Town", "Test Town", "Test Town", "Test Town"],
"district": ["Test District"] * 4,
"county": ["Test County"] * 4,
"date_of_transfer": [
"2024-01-01",
"2024-01-02",
"2024-01-03",
"2024-01-04",
],
}
assert percentile_col in metrics.columns
assert metrics[percentile_col].to_list() == [0.0, 50.0, 100.0]
def test_safe_extract_zip_dir_rejects_path_traversal(tmp_path: Path) -> None:
    """An archive member named with "../" must be refused with "Unsafe path"."""
    archive_path = tmp_path / "evil.zip"
    destination = tmp_path / "extract"
    with zipfile.ZipFile(archive_path, "w") as zf:
        zf.writestr("../escape.txt", "pwned")

    with pytest.raises(ValueError, match="Unsafe path"):
        _safe_extract_zip_dir(archive_path, destination, force=True)
def test_safe_extract_zip_dir_extracts_benign_archive(tmp_path: Path) -> None:
    """A well-formed archive is unpacked and the extraction directory is returned."""
    source_zip = tmp_path / "ok.zip"
    with zipfile.ZipFile(source_zip, "w") as zf:
        zf.writestr("data/x.txt", "hello")

    target = tmp_path / "extract"
    returned = _safe_extract_zip_dir(source_zip, target, force=True)

    assert returned == target
    extracted_file = target / "data" / "x.txt"
    assert extracted_file.read_text() == "hello"
def test_geometry_column_resolution() -> None:
    """Check each resolution path: metadata name, known names, last-column fallback."""
    cases = [
        ({"geometry_name": "SHAPE"}, ["MEANHT", "SHAPE"], "SHAPE"),
        ({}, ["a", "wkb_geometry", "b"], "wkb_geometry"),
        ({"geometry_name": None}, ["x", "geom"], "geom"),
        ({}, ["a", "b", "c"], "c"),  # last-column fallback
    ]
    for metadata, column_names, expected in cases:
        assert _geometry_column(metadata, column_names) == expected
def _zip_with_shapefiles(zip_path: Path, names: list[str]) -> None:
with zipfile.ZipFile(zip_path, "w") as archive:
for name in names:
archive.writestr(name, "")
def test_nfi_dataset_path_requires_exactly_one_shapefile(tmp_path: Path) -> None:
multi = tmp_path / "multi.zip"
_zip_with_shapefiles(multi, ["a.shp", "b.shp"])
with pytest.raises(ValueError, match="exactly one shapefile"):
_nfi_dataset_path(multi, tmp_path / "multi_x", force_extract=True, use_vsizip=False)
none = tmp_path / "none.zip"
_zip_with_shapefiles(none, ["readme.txt"])
with pytest.raises(FileNotFoundError):
_nfi_dataset_path(none, tmp_path / "none_x", force_extract=True, use_vsizip=False)
one = tmp_path / "one.zip"
_zip_with_shapefiles(one, ["woodland.shp", "woodland.dbf"])
resolved = _nfi_dataset_path(
one, tmp_path / "one_x", force_extract=True, use_vsizip=False
)
price_paid_path = tmp_path / "price-paid.parquet"
output_streets = tmp_path / "streets.parquet"
output_addresses = tmp_path / "addresses.parquet"
price_paid.write_parquet(price_paid_path)
assert resolved.endswith("woodland.shp")
_write_street_rollups(
postcode_metrics=postcode_metrics,
price_paid_path=price_paid_path,
output_streets=output_streets,
output_addresses=output_addresses,
radius_m=radius_m,
def test_layers_selection_and_unknown(monkeypatch: pytest.MonkeyPatch) -> None:
monkeypatch.setattr(
pyogrio,
"list_layers",
lambda _path: [("L1", "Polygon"), ("L2", "Polygon")],
)
streets = pl.read_parquet(output_streets).sort("street")
addresses = pl.read_parquet(output_addresses)
assert streets["street"].to_list() == ["Elm Street", "Oak Road"]
assert streets[STREET_TREE_COVERAGE_COL].to_list() == pytest.approx([50.0, 16.7])
assert streets.select("street", STREET_TREE_DENSITY_COL).rows() == [
("Elm Street", 100.0),
("Oak Road", 0.0),
]
assert percentile_col in addresses.columns
assert STREET_TREE_COVERAGE_COL in addresses.columns
assert STREET_TREE_DENSITY_COL in addresses.columns
assert _layers("ignored", None) == ["L1", "L2"]
assert _layers("ignored", ("L2",)) == ["L2"]
with pytest.raises(ValueError, match="Unknown TOW layer"):
_layers("ignored", ("L3",))

View file

@ -1,16 +1,28 @@
"""Derive street-scale tree density metrics from Forest Research TOW + NFI data.
"""Derive postcode-scale tree density metrics from Forest Research TOW + NFI data.
The Forest Research Trees Outside Woodland release is an Esri File Geodatabase
inside property-data/FR_TOW_V1_ALL.zip. This transformer computes a compact
postcode-level metric from the tree polygons, then optionally rolls that up to
Price Paid street names so the dashboard can answer "what is this address's
street like?" without loading the full geodatabase at runtime.
postcode-level metric from the tree polygons so the dashboard can answer "how
green is this postcode?" without loading the full geodatabase at runtime.
TOW only covers trees *outside* woodland, so the National Forest Inventory (NFI)
woodland layer is optionally unioned in. TOW canopy is accumulated by centroid
proximity (tiny crowns), while large NFI woodland parcels are accumulated by
true buffer-clipped intersection area so they cannot saturate a postcode from
mere centroid proximity.
Every postcode centroid is expanded into a radius-r buffer ("extended area").
Both TOW tree crowns and National Forest Inventory (NFI) woodland parcels are
accumulated by *true buffer-clipped intersection area*: only the part of each
polygon that falls inside a postcode's buffer is counted, never the area that
spills outside it. A crown straddling the buffer edge therefore contributes only
its inside portion, and a parcel reaching into the buffer from outside is still
counted -- no polygon can saturate a postcode from mere proximity.
TOW only covers trees *outside* woodland, so the NFI woodland layer is the
geometric complement of TOW and is optionally unioned in. The two products are
*assumed disjoint*: clipped TOW crown area and clipped NFI woodland area are
summed into the same per-postcode accumulator, so any spatial overlap between a
TOW crown and an NFI parcel (boundary slop where "groups of trees" meet
"woodland") would be double-counted. The final density is capped at 100% and
_finalize_metrics logs how many postcodes exceed 100% raw coverage, which is a
direct symptom of such overlap (or of overlapping crowns within one buffer); if
that count is material the products are not disjoint and the NFI clip should be
taken against the complement of TOW.
"""
from __future__ import annotations
@ -25,16 +37,12 @@ import numpy as np
import polars as pl
import pyogrio
import shapely
from scipy.spatial import cKDTree
TOW_GDB_NAME = "FR_TOW_V1_ALL.gdb"
STREET_TREE_DENSITY_COL = "Street tree density percentile"
STREET_TREE_COVERAGE_COL = "Street tree coverage (%)"
POSTCODE_DENSITY_COL = "Tree canopy density within {radius}m (%)"
POSTCODE_DENSITY_PERCENTILE_COL = "Tree canopy density percentile within {radius}m"
POSTCODE_AREA_COL = "Tree canopy area within {radius}m (sqm)"
POSTCODE_COUNT_COL = "Tree features within {radius}m"
POSTCODE_HEIGHT_COL = "Mean TOW height within {radius}m (m)"
# National Forest Inventory (NFI) woodland — the geometric complement of TOW.
@ -131,13 +139,24 @@ def _safe_extract_zip_dir(zip_path: Path, extract_dir: Path, force: bool) -> Pat
def _nfi_dataset_path(
zip_path: Path, extract_dir: Path, force_extract: bool, use_vsizip: bool
) -> str:
"""Resolve the NFI woodland shapefile path, extracting the zip if needed."""
"""Resolve the NFI woodland shapefile path, extracting the zip if needed.
Raises if the archive contains zero or more than one shapefile rather than
silently picking one, so an ambiguous NFI release fails loudly instead of
accumulating canopy from the wrong layer.
"""
if use_vsizip:
return f"/vsizip/{zip_path.resolve()}"
extracted = _safe_extract_zip_dir(zip_path, extract_dir, force_extract)
shapefiles = sorted(extracted.rglob("*.shp"))
if not shapefiles:
raise FileNotFoundError(f"No .shp found inside {zip_path}")
if len(shapefiles) > 1:
names = ", ".join(path.name for path in shapefiles)
raise ValueError(
f"Expected exactly one shapefile inside {zip_path}, found {len(shapefiles)} "
f"({names}); cannot unambiguously pick the NFI woodland layer"
)
return str(shapefiles[0])
@ -146,7 +165,7 @@ def _geometry_column(metadata: dict, column_names: list[str]) -> str:
geometry_name = metadata.get("geometry_name")
if geometry_name:
return str(geometry_name)
for name in ("wkb_geometry", "geometry", "geom"):
for name in ("wkb_geometry", "geometry", "geom", "SHAPE"):
if name in column_names:
return name
return column_names[-1]
@ -184,11 +203,10 @@ def _layers(dataset_path: str, selected_layers: tuple[str, ...] | None) -> list[
return [layer for layer in available if layer in selected_layers]
def _metric_columns(radius_m: int) -> tuple[str, str, str, str]:
def _metric_columns(radius_m: int) -> tuple[str, str, str]:
return (
POSTCODE_DENSITY_COL.format(radius=radius_m),
POSTCODE_AREA_COL.format(radius=radius_m),
POSTCODE_COUNT_COL.format(radius=radius_m),
POSTCODE_HEIGHT_COL.format(radius=radius_m),
)
@ -198,20 +216,23 @@ def _postcode_density_percentile_col(radius_m: int) -> str:
def _coverage_percentile_expr(column: str, alias: str) -> pl.Expr:
"""Rank higher tree coverage higher on a 0-100 England-wide percentile scale."""
"""Rank tree coverage on a 0-100 England-wide percentile scale.
A single tie-consistent average-rank formula is used for every value so the
scale is internally consistent end to end: tied values share their mean rank,
so the lowest coverage maps toward 0 and the highest toward 100 only when they
are not themselves tied. An all-equal (or single-value) column has no spread
and maps to the neutral midpoint (50).
"""
value = pl.col(column).fill_nan(None)
non_null_count = value.count()
rank = value.rank("average")
return (
pl.when(value.is_null())
.then(None)
.when(value == value.min())
.then(0.0)
.when(value == value.max())
.then(100.0)
.when(non_null_count > 1)
.then(((rank - 1) / (non_null_count - 1) * 100).round(1))
.otherwise(100.0)
.otherwise(50.0)
.cast(pl.Float32)
.alias(alias)
)
@ -220,7 +241,7 @@ def _coverage_percentile_expr(column: str, alias: str) -> pl.Expr:
def _with_postcode_density_percentiles(
postcode_metrics: pl.DataFrame, radius_m: int
) -> pl.DataFrame:
density_col, _area_col, _count_col, _height_col = _metric_columns(radius_m)
density_col, _area_col, _height_col = _metric_columns(radius_m)
return postcode_metrics.with_columns(
_coverage_percentile_expr(
density_col,
@ -229,28 +250,88 @@ def _with_postcode_density_percentiles(
)
def _accumulate_tree_metrics(
def _postcode_buffers(
    points: pl.DataFrame, radius_m: int
) -> tuple[np.ndarray, shapely.STRtree]:
    """Return one radius-``radius_m`` buffer per postcode and a spatial index over them.

    Buffers are produced in the row order of ``points`` (columns ``x`` and ``y``),
    so the position of a buffer in the returned array -- and therefore any index
    the STRtree reports -- is also the postcode's accumulator slot.
    """
    centroid_xy = points.select("x", "y").to_numpy()
    centroid_points = shapely.points(centroid_xy)
    # quad_segs=8 approximates each circle with 8 segments per quarter turn.
    buffers = shapely.buffer(centroid_points, radius_m, quad_segs=8)
    spatial_index = shapely.STRtree(buffers)
    return buffers, spatial_index
def _accumulate_clipped_area(
geoms: np.ndarray,
circles: np.ndarray,
tree: shapely.STRtree,
canopy_area: np.ndarray,
height: np.ndarray | None = None,
height_weighted_sum: np.ndarray | None = None,
height_weight: np.ndarray | None = None,
) -> None:
"""Add each polygon's in-buffer overlap area to every postcode it intersects.
Only area(polygon circle) is accumulated -- never the area of the polygon
that falls outside the postcode's extended buffer -- so a crown straddling
the buffer edge contributes only its inside portion and a large parcel cannot
saturate a postcode from mere proximity. When ``height`` is supplied the mean
feature height is accumulated weighted by that same clipped overlap area.
"""
keep = ~shapely.is_missing(geoms) & ~shapely.is_empty(geoms)
geoms = geoms[keep]
if height is not None:
height = height[keep]
if geoms.size == 0:
return
# query(predicate="intersects") over the circle STRtree returns exactly the
# (polygon, circle) pairs whose clipped overlap can be positive -- i.e. the
# polygon overlaps that postcode's radius-r buffer.
geom_index, postcode_index = tree.query(geoms, predicate="intersects")
if geom_index.size == 0:
return
clipped_area = shapely.area(
shapely.intersection(geoms[geom_index], circles[postcode_index])
)
positive = clipped_area > 0
geom_index = geom_index[positive]
postcode_index = postcode_index[positive]
clipped_area = clipped_area[positive]
np.add.at(canopy_area, postcode_index, clipped_area)
if height is not None:
feature_height = height[geom_index]
finite = np.isfinite(feature_height)
if finite.any():
np.add.at(
height_weighted_sum,
postcode_index[finite],
feature_height[finite] * clipped_area[finite],
)
np.add.at(height_weight, postcode_index[finite], clipped_area[finite])
def _accumulate_tow_metrics(
dataset_path: str,
points: pl.DataFrame,
radius_m: int,
circles: np.ndarray,
tree: shapely.STRtree,
canopy_area: np.ndarray,
height_weighted_sum: np.ndarray,
height_weight: np.ndarray,
batch_size: int,
layer_names: tuple[str, ...] | None,
max_features_per_layer: int | None,
workers: int,
canopy_area: np.ndarray,
feature_count: np.ndarray,
height_weighted_sum: np.ndarray,
height_weight: np.ndarray,
) -> None:
xy = points.select("x", "y").to_numpy()
tree = cKDTree(xy)
layers = _layers(dataset_path, layer_names)
print(f"Processing {len(layers)} TOW layer(s): {', '.join(layers)}")
columns = ["Woodland_Type", "TOW_Area_M", "MEANHT"]
columns = ["MEANHT"]
total_features_seen = 0
total_features_used = 0
for layer in layers:
info = pyogrio.read_info(dataset_path, layer=layer)
@ -263,7 +344,7 @@ def _accumulate_tree_metrics(
columns=columns,
batch_size=batch_size,
use_pyarrow=True,
) as (_meta, reader):
) as (meta, reader):
for batch_index, batch in enumerate(reader, start=1):
if max_features_per_layer is not None:
remaining = max_features_per_layer - layer_features_seen
@ -275,135 +356,29 @@ def _accumulate_tree_metrics(
layer_features_seen += batch.num_rows
total_features_seen += batch.num_rows
names = batch.schema.names
area = np.asarray(
batch.column(names.index("TOW_Area_M")).to_numpy(zero_copy_only=False),
dtype=np.float64,
)
geometry_column = _geometry_column(meta, names)
height = np.asarray(
batch.column(names.index("MEANHT")).to_numpy(zero_copy_only=False),
dtype=np.float64,
)
geometry = np.asarray(
batch.column(names.index("SHAPE")).to_numpy(zero_copy_only=False),
batch.column(names.index(geometry_column)).to_numpy(
zero_copy_only=False
),
dtype=object,
)
valid = np.isfinite(area) & (area > 0)
if not valid.any():
continue
geometry = geometry[valid]
area = area[valid]
height = height[valid]
centroids = shapely.centroid(shapely.from_wkb(geometry))
x = shapely.get_x(centroids)
y = shapely.get_y(centroids)
valid_xy = np.isfinite(x) & np.isfinite(y)
if not valid_xy.any():
continue
x = x[valid_xy]
y = y[valid_xy]
area = area[valid_xy]
height = height[valid_xy]
nearby = tree.query_ball_point(
np.column_stack((x, y)), radius_m, workers=workers
_accumulate_clipped_area(
shapely.from_wkb(geometry),
circles,
tree,
canopy_area,
height=height,
height_weighted_sum=height_weighted_sum,
height_weight=height_weight,
)
lengths = np.fromiter(
(len(postcode_indexes) for postcode_indexes in nearby),
dtype=np.int32,
count=len(nearby),
)
matching_features = lengths > 0
if matching_features.any():
postcode_indexes = np.concatenate(
[indexes for indexes in nearby if indexes]
).astype(np.int64, copy=False)
feature_indexes = np.repeat(
np.flatnonzero(matching_features), lengths[matching_features]
)
np.add.at(canopy_area, postcode_indexes, area[feature_indexes])
np.add.at(feature_count, postcode_indexes, 1)
feature_height = height[feature_indexes]
valid_height = np.isfinite(feature_height)
if valid_height.any():
height_area = area[feature_indexes][valid_height]
np.add.at(
height_weighted_sum,
postcode_indexes[valid_height],
feature_height[valid_height] * height_area,
)
np.add.at(
height_weight,
postcode_indexes[valid_height],
height_area,
)
total_features_used += len(area)
if batch_index == 1 or batch_index % 25 == 0:
print(
f" batch {batch_index:,}: "
f"{total_features_seen:,} rows read, "
f"{total_features_used:,} features with usable centroids"
)
def _postcode_buffers(
    points: pl.DataFrame, radius_m: int
) -> tuple[np.ndarray, shapely.STRtree]:
    """Create a radius-``radius_m`` circle per postcode and index the circles.

    Circles follow the row order of ``points`` (its ``x``/``y`` columns), so a
    circle's array position equals the postcode's index.
    """
    coords = points.select("x", "y").to_numpy()
    centres = shapely.points(coords)
    # quad_segs=8 approximates each circle with 8 segments per quarter turn.
    postcode_circles = shapely.buffer(centres, radius_m, quad_segs=8)
    circle_tree = shapely.STRtree(postcode_circles)
    return postcode_circles, circle_tree
def _add_nfi_batch(
    geoms: np.ndarray,
    category: np.ndarray,
    circles: np.ndarray,
    tree: shapely.STRtree,
    canopy_area: np.ndarray,
    feature_count: np.ndarray,
    radius_m: int,
) -> None:
    """Accumulate one batch of NFI woodland by buffer-clipped intersection area.

    Rows whose ``category`` is not ``NFI_WOODLAND_VALUE``, and missing or empty
    geometries, are ignored. For each remaining polygon only
    area(polygon ∩ circle) is added to ``canopy_area`` for every postcode circle
    it overlaps, and ``feature_count`` is incremented once per overlapping
    (polygon, postcode) pair. Both accumulators are updated in place.
    """
    is_woodland = category == NFI_WOODLAND_VALUE
    is_present = ~shapely.is_missing(geoms)
    woodland = geoms[is_woodland & is_present]
    if woodland.size:
        woodland = woodland[~shapely.is_empty(woodland)]
    if woodland.size == 0:
        return

    # Candidate (parcel, postcode) pairs within radius_m of an indexed geometry.
    # The candidate set may be over-inclusive; pairs with no real overlap are
    # discarded by the positive-area filter below.
    parcel_index, slot_index = tree.query(
        woodland, predicate="dwithin", distance=radius_m
    )
    if parcel_index.size == 0:
        return

    overlap_area = shapely.area(
        shapely.intersection(woodland[parcel_index], circles[slot_index])
    )
    has_overlap = overlap_area > 0
    hit_slots = slot_index[has_overlap]

    np.add.at(canopy_area, hit_slots, overlap_area[has_overlap])
    np.add.at(feature_count, hit_slots, 1)
print(f" batch {batch_index:,}: {total_features_seen:,} rows read")
def _accumulate_nfi_metrics(
@ -411,8 +386,6 @@ def _accumulate_nfi_metrics(
circles: np.ndarray,
tree: shapely.STRtree,
canopy_area: np.ndarray,
feature_count: np.ndarray,
radius_m: int,
batch_size: int,
max_nfi_features: int | None,
) -> None:
@ -455,14 +428,12 @@ def _accumulate_nfi_metrics(
),
dtype=object,
)
_add_nfi_batch(
shapely.from_wkb(geometry),
category,
geoms = shapely.from_wkb(geometry)
_accumulate_clipped_area(
geoms[category == NFI_WOODLAND_VALUE],
circles,
tree,
canopy_area,
feature_count,
radius_m,
)
if batch_index == 1 or batch_index % 25 == 0:
print(f" NFI batch {batch_index:,}: {features_seen:,} rows read")
@ -471,15 +442,26 @@ def _accumulate_nfi_metrics(
def _finalize_metrics(
points: pl.DataFrame,
canopy_area: np.ndarray,
feature_count: np.ndarray,
height_weighted_sum: np.ndarray,
height_weight: np.ndarray,
radius_m: int,
) -> pl.DataFrame:
n_points = points.height
density_col, area_col, count_col, height_col = _metric_columns(radius_m)
density_col, area_col, height_col = _metric_columns(radius_m)
buffer_area = math.pi * radius_m * radius_m
density_pct = np.minimum(canopy_area / buffer_area * 100.0, 100.0)
raw_density = canopy_area / buffer_area * 100.0
density_pct = np.minimum(raw_density, 100.0)
# Symptom of the assumed-disjoint TOW/NFI union being violated (or of
# overlapping crowns inside one buffer): clipped areas alone cannot exceed the
# buffer unless polygons overlap. Surface it rather than hide it behind the cap.
over_count = int(np.count_nonzero(raw_density > 100.0))
if over_count:
print(
f" note: {over_count:,} postcode(s) exceeded 100% raw canopy and were "
"capped — indicates overlapping TOW/NFI canopy within the buffer"
)
mean_height = np.divide(
height_weighted_sum,
height_weight,
@ -492,7 +474,6 @@ def _finalize_metrics(
"postcode": points["postcode"],
area_col: canopy_area.round(1).astype(np.float32),
density_col: density_pct.round(1).astype(np.float32),
count_col: feature_count.astype(np.uint32),
height_col: np.round(mean_height, 1).astype(np.float32),
}
).with_columns(
@ -500,181 +481,9 @@ def _finalize_metrics(
)
def _clean_key_expr(column: str) -> pl.Expr:
    """Build an expression normalising ``column`` into a comparison key.

    Nulls become "", text is upper-cased, every run of characters other than
    A-Z / 0-9 becomes a single space, whitespace runs are collapsed, and the
    result is stripped.
    """
    upper = pl.col(column).fill_null("").str.to_uppercase()
    alnum_only = upper.str.replace_all(r"[^A-Z0-9]+", " ")
    single_spaced = alnum_only.str.replace_all(r"\s+", " ")
    return single_spaced.str.strip_chars()
def _latest_price_paid_addresses(price_paid_path: Path) -> pl.LazyFrame:
"""Lazily build one row per (postcode, pp_address) from the Price Paid parquet.

Each row carries the street/locality/town/district/county values of the last
record in ``date_of_transfer`` order, plus a normalised ``street_key``. Nothing
is read until the returned LazyFrame is collected or sunk.
"""
return (
pl.scan_parquet(price_paid_path)
.select(
# Normalise the postcode (trim + upper-case) before it is used as a key.
pl.col("postcode").str.strip_chars().str.to_uppercase().alias("postcode"),
"paon",
"saon",
"street",
"locality",
"town_city",
"district",
"county",
"date_of_transfer",
)
# Drop rows without a postcode or without a usable street name
# (null, or empty once cleaned by _clean_key_expr).
.filter(pl.col("postcode").is_not_null())
.filter(pl.col("street").is_not_null())
.filter(_clean_key_expr("street") != "")
.with_columns(
# pp_address = "saon paon street", skipping null parts, with whitespace
# collapsed and trimmed.
pl.concat_str(
[pl.col("saon"), pl.col("paon"), pl.col("street")],
separator=" ",
ignore_nulls=True,
)
.str.replace_all(r"\s+", " ")
.str.strip_chars()
.alias("pp_address"),
)
.filter(pl.col("pp_address").is_not_null())
# Sort by transfer date so that .last() below picks the most recent record
# in each group.
.sort("date_of_transfer")
.group_by("postcode", "pp_address", maintain_order=True)
.agg(
pl.col("street").last(),
pl.col("locality").last(),
pl.col("town_city").last(),
pl.col("district").last(),
pl.col("county").last(),
)
.with_columns(
# street_key joins the cleaned street, town, district and county with "|".
pl.concat_str(
[
_clean_key_expr("street"),
_clean_key_expr("town_city"),
_clean_key_expr("district"),
_clean_key_expr("county"),
],
separator="|",
).alias("street_key")
)
)
def _weighted_mean_expr(column: str, weight: str) -> pl.Expr:
    """Aggregate expression: mean of ``column`` weighted by ``weight``.

    Rows where ``column`` is null or NaN contribute to neither the numerator nor
    the denominator. The result is null when the summed weight is not positive.
    """
    values = pl.col(column)
    weights = pl.col(weight)
    usable = values.is_not_null() & ~values.is_nan()

    weighted_total = pl.when(usable).then(values * weights).sum()
    weight_total = pl.when(usable).then(weights).sum()

    mean = weighted_total / weight_total
    return pl.when(weight_total > 0).then(mean).otherwise(None)
def _write_street_rollups(
postcode_metrics: pl.DataFrame,
price_paid_path: Path,
output_streets: Path | None,
output_addresses: Path | None,
radius_m: int,
) -> None:
"""Roll postcode tree metrics up to street level and write the requested outputs.

Args:
postcode_metrics: Postcode-level metrics frame; joined on ``postcode`` and
expected to hold the columns named by ``_metric_columns(radius_m)``.
price_paid_path: Price Paid parquet passed to ``_latest_price_paid_addresses``.
output_streets: Destination for the per-street rollup, or None to skip it.
output_addresses: Destination for the address-level join, or None to skip it.
radius_m: Buffer radius used to resolve the metric column names.

Returns immediately without reading anything when both outputs are None.
"""
if output_streets is None and output_addresses is None:
return
density_col, area_col, count_col, height_col = _metric_columns(radius_m)
metrics = postcode_metrics.lazy()
# Inner join: only addresses whose postcode has metrics are kept.
addresses = _latest_price_paid_addresses(price_paid_path).join(
metrics, on="postcode", how="inner"
)
# One row per (street, postcode) with the number of addresses it contains.
# .first() is used for the metrics -- presumably they are identical within a
# group because they were joined on postcode; verify postcode_metrics has one
# row per postcode.
per_postcode = (
addresses.group_by(
"street_key",
"postcode",
"street",
"locality",
"town_city",
"district",
"county",
)
.agg(
pl.len().alias("address_count"),
pl.col(density_col).first(),
pl.col(area_col).first(),
pl.col(count_col).first(),
pl.col(height_col).first(),
)
.collect()
)
# Collapse to one row per street_key: each metric becomes an
# address-count-weighted mean across the street's postcodes, rounded to 1 dp.
streets = (
per_postcode.lazy()
.group_by("street_key")
.agg(
pl.col("street").first(),
pl.col("locality").first(),
pl.col("town_city").first(),
pl.col("district").first(),
pl.col("county").first(),
pl.col("postcode").n_unique().alias("postcode_count"),
pl.col("address_count").sum().alias("address_count"),
_weighted_mean_expr(density_col, "address_count")
.round(1)
.cast(pl.Float32)
.alias(STREET_TREE_COVERAGE_COL),
_weighted_mean_expr(area_col, "address_count")
.round(1)
.cast(pl.Float32)
.alias(f"Street average {area_col}"),
_weighted_mean_expr(count_col, "address_count")
.round(1)
.cast(pl.Float32)
.alias(f"Street average {count_col}"),
_weighted_mean_expr(height_col, "address_count")
.round(1)
.cast(pl.Float32)
.alias(f"Street average {height_col}"),
)
.with_columns(
# Percentile is ranked over the (already rounded) street coverage column.
_coverage_percentile_expr(
STREET_TREE_COVERAGE_COL,
STREET_TREE_DENSITY_COL,
)
)
.sort("street_key")
.collect()
)
if output_addresses is not None:
output_addresses.parent.mkdir(parents=True, exist_ok=True)
# Left join so every address row is kept, with its street's coverage and
# percentile attached.
address_output = addresses.join(
streets.lazy().select(
"street_key",
STREET_TREE_COVERAGE_COL,
STREET_TREE_DENSITY_COL,
),
on="street_key",
how="left",
)
# The address frame is still lazy, so it is written with sink_parquet.
address_output.sink_parquet(output_addresses, compression="zstd")
print(f"Wrote address tree-density join: {output_addresses}")
if output_streets is not None:
output_streets.parent.mkdir(parents=True, exist_ok=True)
streets.write_parquet(output_streets, compression="zstd")
print(f"Wrote street tree-density rollup: {output_streets}")
def _parse_csv_arg(value: str | None) -> tuple[str, ...] | None:
if value is None:
return None
if value.lower() == "all":
return None
parts = tuple(part.strip() for part in value.split(",") if part.strip())
return parts or None
def main() -> None:
parser = argparse.ArgumentParser(
description="Build postcode and street tree-density metrics from FR_TOW_V1_ALL.zip"
description="Build postcode-level tree-density metrics from FR_TOW_V1_ALL.zip"
)
parser.add_argument(
"--tow-zip",
@ -716,35 +525,17 @@ def main() -> None:
default=Path("property-data/arcgis_data.parquet"),
help="Postcode centroid parquet with east1m/north1m columns",
)
parser.add_argument(
"--price-paid",
type=Path,
default=None,
help="Optional Price Paid parquet used to roll postcode metrics up to streets",
)
parser.add_argument(
"--output-postcodes",
type=Path,
required=True,
help="Output postcode-level tree-density parquet",
)
parser.add_argument(
"--output-streets",
type=Path,
default=None,
help="Optional output street-level tree-density parquet",
)
parser.add_argument(
"--output-addresses",
type=Path,
default=None,
help="Optional output address/street join parquet keyed by postcode and pp_address",
)
parser.add_argument(
"--radius-m",
type=int,
default=50,
help="Radius around each postcode centroid used as the street-scale buffer",
help="Radius around each postcode centroid used as the extended buffer",
)
parser.add_argument(
"--layers",
@ -757,12 +548,6 @@ def main() -> None:
default=65_536,
help="Arrow batch size for reading TOW features",
)
parser.add_argument(
"--workers",
type=int,
default=-1,
help="Worker count passed to scipy cKDTree.query_ball_point",
)
parser.add_argument(
"--max-postcodes",
type=int,
@ -783,9 +568,6 @@ def main() -> None:
)
args = parser.parse_args()
if (args.output_streets or args.output_addresses) and args.price_paid is None:
raise SystemExit("--price-paid is required when writing street/address outputs")
if args.radius_m <= 0:
raise SystemExit("--radius-m must be greater than zero")
@ -797,36 +579,32 @@ def main() -> None:
n_points = points.height
canopy_area = np.zeros(n_points, dtype=np.float64)
feature_count = np.zeros(n_points, dtype=np.uint32)
height_weighted_sum = np.zeros(n_points, dtype=np.float64)
height_weight = np.zeros(n_points, dtype=np.float64)
_accumulate_tree_metrics(
circles, tree = _postcode_buffers(points, args.radius_m)
_accumulate_tow_metrics(
dataset_path=dataset_path,
points=points,
radius_m=args.radius_m,
circles=circles,
tree=tree,
canopy_area=canopy_area,
height_weighted_sum=height_weighted_sum,
height_weight=height_weight,
batch_size=args.batch_size,
layer_names=layer_names,
max_features_per_layer=args.max_features_per_layer,
workers=args.workers,
canopy_area=canopy_area,
feature_count=feature_count,
height_weighted_sum=height_weighted_sum,
height_weight=height_weight,
)
if args.nfi_zip is not None and args.nfi_zip.exists():
nfi_path = _nfi_dataset_path(
args.nfi_zip, args.nfi_extract_dir, args.force_extract, args.use_vsizip
)
circles, nfi_tree = _postcode_buffers(points, args.radius_m)
_accumulate_nfi_metrics(
dataset_path=nfi_path,
circles=circles,
tree=nfi_tree,
tree=tree,
canopy_area=canopy_area,
feature_count=feature_count,
radius_m=args.radius_m,
batch_size=args.batch_size,
max_nfi_features=args.max_nfi_features,
)
@ -836,7 +614,6 @@ def main() -> None:
postcode_metrics = _finalize_metrics(
points,
canopy_area,
feature_count,
height_weighted_sum,
height_weight,
args.radius_m,
@ -849,14 +626,14 @@ def main() -> None:
postcode_metrics.write_parquet(args.output_postcodes, compression="zstd")
print(f"\nWrote postcode tree-density metrics: {args.output_postcodes}")
if args.price_paid is not None:
_write_street_rollups(
postcode_metrics=postcode_metrics,
price_paid_path=args.price_paid,
output_streets=args.output_streets,
output_addresses=args.output_addresses,
radius_m=args.radius_m,
)
def _parse_csv_arg(value: str | None) -> tuple[str, ...] | None:
if value is None:
return None
if value.lower() == "all":
return None
parts = tuple(part.strip() for part in value.split(",") if part.strip())
return parts or None
if __name__ == "__main__":

View file

@ -90,7 +90,7 @@ def _write_tree_geojsonseq(
columns=columns,
batch_size=batch_size,
use_pyarrow=True,
) as (_meta, reader):
) as (meta, reader):
for batch in reader:
if max_features_per_layer is not None:
remaining = max_features_per_layer - layer_features_seen
@ -101,6 +101,7 @@ def _write_tree_geojsonseq(
layer_features_seen += batch.num_rows
names = batch.schema.names
geometry_column = _geometry_column(meta, names)
area = np.asarray(
batch.column(names.index("TOW_Area_M")).to_numpy(
zero_copy_only=False
@ -108,7 +109,7 @@ def _write_tree_geojsonseq(
dtype=np.float64,
)
geometry = np.asarray(
batch.column(names.index("SHAPE")).to_numpy(
batch.column(names.index(geometry_column)).to_numpy(
zero_copy_only=False
),
dtype=object,
@ -327,7 +328,7 @@ def build_tree_overlay_tiles(
str(min_zoom),
"--maximum-zoom",
str(max_zoom),
"--drop-smallest-as-needed",
"--coalesce-smallest-as-needed",
"--extend-zooms-if-still-dropping",
"--temporary-directory",
tmp,

View file

@ -13,7 +13,11 @@ def build_postcode_mapping(arcgis_path: Path) -> pl.DataFrame:
Uses OS National Grid coordinates (east1m, north1m) which are Cartesian metres,
so Euclidean distance via cKDTree gives accurate results without projection.
"""
arcgis = pl.scan_parquet(arcgis_path).filter(pl.col("ctry25cd") == "E92000001")
arcgis = (
pl.scan_parquet(arcgis_path)
.filter(pl.col("ctry25cd") == "E92000001")
.with_columns(pl.col("doterm").cast(pl.Utf8).alias("doterm"))
)
active = (
arcgis.filter(pl.col("doterm").is_null())

View file

@ -9,6 +9,8 @@ import zipfile
from pathlib import Path
import polars as pl
from shapely.geometry import shape
from shapely.validation import explain_validity
def _failures_for_file(path: Path) -> list[str]:
@ -79,9 +81,7 @@ def _split_glob(spec: str) -> tuple[Path, str]:
def _split_pair(spec: str, label: str) -> tuple[Path, Path]:
if "::" not in spec:
raise argparse.ArgumentTypeError(
f"{spec!r} must use LEFT::RIGHT for {label}"
)
raise argparse.ArgumentTypeError(f"{spec!r} must use LEFT::RIGHT for {label}")
left, right = spec.split("::", 1)
if not left or not right:
raise argparse.ArgumentTypeError(f"{spec!r} must include both paths")
@ -143,22 +143,140 @@ def _parquet_postcodes(path: Path) -> set[str]:
.get_column(column)
.to_list()
)
return {_canonical_postcode(value) for value in values if _canonical_postcode(value)}
return {
_canonical_postcode(value) for value in values if _canonical_postcode(value)
}
def _active_english_arcgis_postcodes(path: Path) -> set[str]:
    """Collect the canonical postcodes selected from an ArcGIS postcode parquet.

    A row is selected when its ``ctry25cd`` equals ``"E92000001"`` and its
    ``doterm`` is null; the function name treats those rows as the active
    English postcodes. Null ``pcds`` values are dropped, the remainder is
    de-duplicated, and each value is passed through ``_canonical_postcode``.
    Values whose canonical form is falsy are left out of the result.

    Args:
        path: Parquet file that must contain ``pcds``, ``ctry25cd`` and
            ``doterm`` columns.

    Returns:
        The set of canonical postcode strings for the selected rows.

    Raises:
        ValueError: If any of the three required columns is absent.
    """
    # One lazy scan serves both the schema check and the query, so only the
    # three needed columns are ever materialised.
    frame = pl.scan_parquet(path)
    required = {"pcds", "ctry25cd", "doterm"}
    missing = sorted(required - set(frame.collect_schema().names()))
    if missing:
        raise ValueError(f"{path}: missing ArcGIS postcode columns: {missing}")

    values = (
        frame.filter(pl.col("ctry25cd") == "E92000001")
        # The cast keeps the null test independent of the stored dtype.
        .filter(pl.col("doterm").cast(pl.Utf8).is_null())
        .select(pl.col("pcds").drop_nulls().unique())
        .collect()
        .get_column("pcds")
        .to_list()
    )
    # Canonicalise each value exactly once; the walrus keeps the truthiness
    # filter without a second call per postcode.
    return {
        postcode for value in values if (postcode := _canonical_postcode(value))
    }
def _format_samples(samples: list[str]) -> str:
return "; ".join(samples[:10])
def _boundary_postcode_scan(path: Path) -> tuple[set[str], list[str]]:
# Scan every `*.geojson` file under the boundary directory and return
# `(postcodes, failures)`:
#   - `postcodes`: the distinct canonical postcodes found in
#     `properties.postcodes` across all features;
#   - `failures`: human-readable problem summaries (unreadable files, missing
#     postcodes, duplicates, missing/empty geometry, non-polygonal geometry,
#     invalid geometry). An empty list means no problem was detected.
# Problems are counted in full, but only the first 10 examples of each
# category are kept for the message.
# Use the `units` sub-directory when it exists, otherwise scan `path` itself.
units_dir = path / "units" if (path / "units").is_dir() else path
postcodes: set[str] = set()
# Maps each postcode to the label of the first feature that carried it, so a
# duplicate report can name both locations.
seen: dict[str, str] = {}
failures: list[str] = []
# Per-category example lists (capped at 10 entries below) and full counters.
missing_postcode_samples: list[str] = []
missing_geometry_samples: list[str] = []
non_polygon_samples: list[str] = []
invalid_geometry_samples: list[str] = []
duplicate_samples: list[str] = []
missing_postcode_count = 0
missing_geometry_count = 0
non_polygon_count = 0
invalid_geometry_count = 0
duplicate_count = 0
# Sorted iteration keeps the reported samples deterministic between runs.
for geojson_path in sorted(units_dir.glob("*.geojson")):
try:
with geojson_path.open("r", encoding="utf-8") as handle:
data = json.load(handle)
except Exception as exc:
# A file that cannot be opened or parsed is recorded and skipped so the
# remaining files are still checked.
failures.append(f"{geojson_path}: unreadable GeoJSON: {exc}")
continue
for idx, feature in enumerate(data.get("features", [])):
# Label used in samples whenever the feature has no usable postcode.
label = f"{geojson_path.name} feature {idx}"
properties = feature.get("properties") or {}
value = properties.get("postcodes")
# NOTE(review): `_canonical_postcode` is defined elsewhere in this file;
# presumably it normalises postcode text. A falsy result is treated the
# same as an absent value.
postcode = _canonical_postcode(value) if value is not None else ""
if not postcode:
missing_postcode_count += 1
if len(missing_postcode_samples) < 10:
missing_postcode_samples.append(label)
else:
if postcode in seen:
duplicate_count += 1
if len(duplicate_samples) < 10:
duplicate_samples.append(
f"{postcode} in {seen[postcode]} and {label}"
)
else:
seen[postcode] = label
postcodes.add(postcode)
# Geometry checks run for every feature, including ones whose postcode was
# missing or duplicated.
geometry_data = feature.get("geometry")
if geometry_data is None:
missing_geometry_count += 1
if len(missing_geometry_samples) < 10:
missing_geometry_samples.append(f"{postcode or label}")
continue
try:
geom = shape(geometry_data)
except Exception as exc:
# Geometry that `shape` cannot build is counted as invalid.
invalid_geometry_count += 1
if len(invalid_geometry_samples) < 10:
invalid_geometry_samples.append(f"{postcode or label}: {exc}")
continue
# At most one geometry problem is recorded per feature, checked in this
# order: empty, wrong geometry type, invalid.
if geom.is_empty:
missing_geometry_count += 1
if len(missing_geometry_samples) < 10:
missing_geometry_samples.append(f"{postcode or label}: empty")
elif geom.geom_type not in {"Polygon", "MultiPolygon"}:
non_polygon_count += 1
if len(non_polygon_samples) < 10:
non_polygon_samples.append(f"{postcode or label}: {geom.geom_type}")
elif not geom.is_valid:
invalid_geometry_count += 1
if len(invalid_geometry_samples) < 10:
invalid_geometry_samples.append(
f"{postcode or label}: {explain_validity(geom)}"
)
# Emit one summary line per category that had at least one occurrence.
if missing_postcode_count:
failures.append(
f"{path}: {missing_postcode_count:,} boundary features are missing "
f"properties.postcodes; sample: {_format_samples(missing_postcode_samples)}"
)
if duplicate_count:
failures.append(
f"{path}: {duplicate_count:,} duplicate boundary postcode features; "
f"sample: {_format_samples(duplicate_samples)}"
)
if missing_geometry_count:
failures.append(
f"{path}: {missing_geometry_count:,} boundary features are missing or empty "
f"geometry; sample: {_format_samples(missing_geometry_samples)}"
)
if non_polygon_count:
failures.append(
f"{path}: {non_polygon_count:,} boundary features are not polygonal; "
f"sample: {_format_samples(non_polygon_samples)}"
)
if invalid_geometry_count:
failures.append(
f"{path}: {invalid_geometry_count:,} invalid boundary geometries; "
f"sample: {_format_samples(invalid_geometry_samples)}"
)
return postcodes, failures
def _boundary_postcodes(path: Path) -> set[str]:
    """Return the canonical postcodes found in the boundary GeoJSON files.

    This is a strict wrapper around ``_boundary_postcode_scan``: the scan does
    all file reading and checking, and any failure it reports is escalated to
    an exception instead of being returned.

    Args:
        path: Boundary directory handed to ``_boundary_postcode_scan``.

    Returns:
        The set of postcodes collected by the scan.

    Raises:
        ValueError: If the scan reported at least one failure; the message is
            every failure joined with ``"; "``.
    """
    # The scan is the single source of truth. A separate hand-rolled pass over
    # the same files would only duplicate the I/O, and would raise on an
    # unreadable file before the scan could report it as a failure.
    postcodes, failures = _boundary_postcode_scan(path)
    if failures:
        raise ValueError("; ".join(failures))
    return postcodes
@ -174,11 +292,13 @@ def _failures_for_postcode_boundary_match(spec: str) -> list[str]:
try:
parquet_postcodes = _parquet_postcodes(parquet_path)
boundary_postcodes = _boundary_postcodes(boundaries_path)
boundary_postcodes, boundary_failures = _boundary_postcode_scan(boundaries_path)
except Exception as exc:
return [f"{parquet_path} / {boundaries_path}: postcode match check failed: {exc}"]
return [
f"{parquet_path} / {boundaries_path}: postcode match check failed: {exc}"
]
failures = []
failures = list(boundary_failures)
if not boundary_postcodes:
failures.append(f"{boundaries_path}: no boundary postcodes found")
@ -197,6 +317,41 @@ def _failures_for_postcode_boundary_match(spec: str) -> list[str]:
return failures
def _failures_for_active_postcode_boundary_match(spec: str) -> list[str]:
    """Check that ArcGIS postcodes and boundary GeoJSON postcodes are the same set.

    ``spec`` is split by ``_split_pair`` into an ArcGIS parquet path and a
    boundaries directory. If either input fails its basic parquet/directory
    check, only those failures are returned. Otherwise the postcodes from
    ``_active_english_arcgis_postcodes`` are compared with the postcodes found
    by ``_boundary_postcode_scan``.

    Returns:
        A list of failure messages; empty when nothing was reported. It holds
        the scan's own failures, a note when no boundary postcode was found,
        and one message each for postcodes missing a boundary and for
        boundaries with no matching postcode.
    """
    arcgis_path, boundaries_path = _split_pair(
        spec, "active postcode boundary matching"
    )

    # Basic input checks first: there is nothing to compare if either fails.
    parquet_problems = _failures_for_parquet(arcgis_path)
    directory_problems = _failures_for_dir(boundaries_path)
    precheck = parquet_problems + directory_problems
    if precheck:
        return precheck

    try:
        active_postcodes = _active_english_arcgis_postcodes(arcgis_path)
        boundary_postcodes, boundary_failures = _boundary_postcode_scan(boundaries_path)
    except Exception as exc:
        # Any error while loading either side becomes a single failure line.
        return [
            f"{arcgis_path} / {boundaries_path}: active postcode boundary check failed: {exc}"
        ]

    # Start from a copy of the scan's findings, then add the set-comparison
    # results.
    report = list(boundary_failures)
    if not boundary_postcodes:
        report.append(f"{boundaries_path}: no boundary postcodes found")

    missing_boundaries = active_postcodes.difference(boundary_postcodes)
    orphan_boundaries = boundary_postcodes.difference(active_postcodes)

    if missing_boundaries:
        report.append(
            f"{boundaries_path}: {len(missing_boundaries):,} active English postcodes "
            f"from {arcgis_path} are missing boundaries; sample: {_sample(missing_boundaries)}"
        )
    if orphan_boundaries:
        report.append(
            f"{boundaries_path}: {len(orphan_boundaries):,} boundary postcodes are not "
            f"active English postcodes in {arcgis_path}; sample: {_sample(orphan_boundaries)}"
        )
    return report
def main() -> int:
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("--file", action="append", default=[], type=Path)
@ -221,6 +376,15 @@ def main() -> int:
default=[],
help="Require postcode parquet keys to exactly match boundary GeoJSON postcodes: PARQUET::DIR",
)
parser.add_argument(
"--active-postcode-boundary-match",
action="append",
default=[],
help=(
"Require active English ArcGIS postcodes to exactly match boundary "
"GeoJSON postcodes: ARCGIS_PARQUET::DIR"
),
)
args = parser.parse_args()
failures: list[str] = []
@ -238,6 +402,8 @@ def main() -> int:
failures.extend(_failures_for_zip_glob(spec))
for spec in args.postcode_boundary_match:
failures.extend(_failures_for_postcode_boundary_match(spec))
for spec in args.active_postcode_boundary_match:
failures.extend(_failures_for_active_postcode_boundary_match(spec))
if failures:
print("Output validation failed:", file=sys.stderr)

View file

@ -282,17 +282,23 @@ pub fn compute_crime_by_year(
for &row in matching_rows {
let postcode = data.postcode(row);
let Some(series_list) = crime_by_year.series_by_postcode.get(postcode) else {
continue;
};
// For every type the postcode reports, add its per-year counts.
// For types it doesn't report, treat the row as contributing 0 — so we
// bump the row count for *every* known type below.
for series in series_list {
let acc = &mut per_type_year_sums[series.type_idx as usize];
for point in &series.points {
*acc.entry(point.year).or_insert(0.0) += point.count as f64;
// A postcode absent from the by-year table has no recorded crime within
// 50m, so it contributes 0 to every type's per-year sum. It must still be
// counted in the denominator: the matching `(avg/yr)` stat counts those
// same zero-crime postcodes as 0.0 (crime_by_postcode.parquet has a dense
// row for every boundary postcode), so excluding them here would compute
// the chart over a smaller population and report a higher magnitude than
// the headline. Property postcodes are guaranteed to be boundary
// postcodes by the postcode-boundary-match validation, so "absent" means
// genuinely zero-crime, not missing data.
if let Some(series_list) = crime_by_year.series_by_postcode.get(postcode) {
// For every type the postcode reports, add its per-year counts.
for series in series_list {
let acc = &mut per_type_year_sums[series.type_idx as usize];
for point in &series.points {
*acc.entry(point.year).or_insert(0.0) += point.count as f64;
}
}
}
for c in per_type_row_counts.iter_mut() {