idk
This commit is contained in:
parent
a04ac2d857
commit
d43da9708c
47 changed files with 4120 additions and 573 deletions
|
|
@ -4,7 +4,11 @@ import type { MapFlyToOptions, PostcodeGeometry } from '../../types';
|
||||||
import { authHeaders, isAbortError } from '../../lib/api';
|
import { authHeaders, isAbortError } from '../../lib/api';
|
||||||
import { POSTCODE_SEARCH_ZOOM } from '../../lib/consts';
|
import { POSTCODE_SEARCH_ZOOM } from '../../lib/consts';
|
||||||
import { useIsMobile } from '../../hooks/useIsMobile';
|
import { useIsMobile } from '../../hooks/useIsMobile';
|
||||||
import { useLocationSearch, type SearchResult } from '../../hooks/useLocationSearch';
|
import {
|
||||||
|
useLocationSearch,
|
||||||
|
type SearchResult,
|
||||||
|
type ViewportCenter,
|
||||||
|
} from '../../hooks/useLocationSearch';
|
||||||
import { PlaceSearchInput } from '../ui/PlaceSearchInput';
|
import { PlaceSearchInput } from '../ui/PlaceSearchInput';
|
||||||
import { LocateIcon } from '../ui/icons/LocateIcon';
|
import { LocateIcon } from '../ui/icons/LocateIcon';
|
||||||
import { SearchIcon } from '../ui/icons/SearchIcon';
|
import { SearchIcon } from '../ui/icons/SearchIcon';
|
||||||
|
|
@ -44,6 +48,12 @@ const ZOOM_FOR_TYPE: Record<string, number> = {
|
||||||
locality: 14,
|
locality: 14,
|
||||||
hamlet: 15,
|
hamlet: 15,
|
||||||
isolated_dwelling: 16,
|
isolated_dwelling: 16,
|
||||||
|
street: 16,
|
||||||
|
university: 15,
|
||||||
|
park: 15,
|
||||||
|
attraction: 16,
|
||||||
|
hospital: 16,
|
||||||
|
retail: 15,
|
||||||
};
|
};
|
||||||
|
|
||||||
const DEV_CURRENT_LOCATION = {
|
const DEV_CURRENT_LOCATION = {
|
||||||
|
|
@ -56,6 +66,7 @@ export default function LocationSearch({
|
||||||
onLocationSearched,
|
onLocationSearched,
|
||||||
onCurrentLocationFound,
|
onCurrentLocationFound,
|
||||||
onMouseEnter,
|
onMouseEnter,
|
||||||
|
getViewportCenter,
|
||||||
className = '',
|
className = '',
|
||||||
inputClassName,
|
inputClassName,
|
||||||
}: {
|
}: {
|
||||||
|
|
@ -63,11 +74,13 @@ export default function LocationSearch({
|
||||||
onLocationSearched?: (postcode: SearchedLocation | null) => void;
|
onLocationSearched?: (postcode: SearchedLocation | null) => void;
|
||||||
onCurrentLocationFound?: (lat: number, lng: number) => void;
|
onCurrentLocationFound?: (lat: number, lng: number) => void;
|
||||||
onMouseEnter?: () => void;
|
onMouseEnter?: () => void;
|
||||||
|
/** Returns the current map centre so search ranking can bias toward the visible area. */
|
||||||
|
getViewportCenter?: () => ViewportCenter | null;
|
||||||
className?: string;
|
className?: string;
|
||||||
inputClassName?: string;
|
inputClassName?: string;
|
||||||
}) {
|
}) {
|
||||||
const { t } = useTranslation();
|
const { t } = useTranslation();
|
||||||
const search = useLocationSearch();
|
const search = useLocationSearch(undefined, getViewportCenter);
|
||||||
const [error, setError] = useState<string | null>(null);
|
const [error, setError] = useState<string | null>(null);
|
||||||
const [loading, setLoading] = useState(false);
|
const [loading, setLoading] = useState(false);
|
||||||
const [expanded, setExpanded] = useState(false);
|
const [expanded, setExpanded] = useState(false);
|
||||||
|
|
|
||||||
|
|
@ -154,7 +154,100 @@ function searchResultKey(result: SearchResult): string {
|
||||||
return `place:${result.slug}`;
|
return `place:${result.slug}`;
|
||||||
}
|
}
|
||||||
|
|
||||||
export function useLocationSearch(mode?: string) {
|
/** A category-tagged, scored result from the unified `/api/places` ranking. */
|
||||||
|
type UnifiedResultDTO =
|
||||||
|
| { type: 'postcode'; label: string; score: number }
|
||||||
|
| { type: 'address'; address: string; postcode: string; lat: number; lon: number; score: number }
|
||||||
|
| {
|
||||||
|
type: 'place';
|
||||||
|
name: string;
|
||||||
|
slug: string;
|
||||||
|
place_type: string;
|
||||||
|
lat: number;
|
||||||
|
lon: number;
|
||||||
|
city?: string;
|
||||||
|
score: number;
|
||||||
|
};
|
||||||
|
|
||||||
|
interface PlacesApiResponse {
|
||||||
|
places: PlaceResult[];
|
||||||
|
postcodes?: string[];
|
||||||
|
addresses?: AddressResult[];
|
||||||
|
/** Preferred: a single relevance-ordered list across all categories. */
|
||||||
|
results?: UnifiedResultDTO[];
|
||||||
|
}
|
||||||
|
|
||||||
|
function isNonNull<T>(value: T | null): value is T {
|
||||||
|
return value !== null;
|
||||||
|
}
|
||||||
|
|
||||||
|
function unifiedToSearchResult(result: UnifiedResultDTO): SearchResult | null {
|
||||||
|
if (result.type === 'postcode') {
|
||||||
|
return { type: 'postcode', label: result.label };
|
||||||
|
}
|
||||||
|
if (result.type === 'address') {
|
||||||
|
return {
|
||||||
|
type: 'address',
|
||||||
|
address: result.address,
|
||||||
|
postcode: result.postcode,
|
||||||
|
lat: result.lat,
|
||||||
|
lon: result.lon,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
if (result.type === 'place') {
|
||||||
|
return {
|
||||||
|
type: 'place',
|
||||||
|
name: result.name,
|
||||||
|
slug: result.slug,
|
||||||
|
place_type: result.place_type,
|
||||||
|
lat: result.lat,
|
||||||
|
lon: result.lon,
|
||||||
|
city: result.city === 'City of London' ? 'London' : result.city,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Legacy ordering for servers that predate the unified `results` list: positional buckets,
|
||||||
|
* re-filtered locally. Retained only as a fallback. */
|
||||||
|
function legacyCombineResults(json: PlacesApiResponse, trimmed: string): SearchResult[] {
|
||||||
|
const placeResults = json.places.map((p) => ({
|
||||||
|
type: 'place' as const,
|
||||||
|
name: p.name,
|
||||||
|
slug: p.slug,
|
||||||
|
place_type: p.place_type,
|
||||||
|
lat: p.lat,
|
||||||
|
lon: p.lon,
|
||||||
|
city: p.city === 'City of London' ? 'London' : p.city,
|
||||||
|
}));
|
||||||
|
const outcodeResults = placeResults.filter((result) => result.place_type === 'outcode');
|
||||||
|
const otherPlaceResults = placeResults.filter((result) => result.place_type !== 'outcode');
|
||||||
|
const postcodeResults: SearchResult[] = (json.postcodes ?? []).map((postcode) => ({
|
||||||
|
type: 'postcode' as const,
|
||||||
|
label: postcode,
|
||||||
|
}));
|
||||||
|
const addressResults: SearchResult[] = (json.addresses ?? []).map((address) => ({
|
||||||
|
type: 'address' as const,
|
||||||
|
address: address.address,
|
||||||
|
postcode: address.postcode,
|
||||||
|
lat: address.lat,
|
||||||
|
lon: address.lon,
|
||||||
|
}));
|
||||||
|
const containsHouseNumber = /\d/.test(trimmed);
|
||||||
|
return filterResultsForQuery(
|
||||||
|
containsHouseNumber
|
||||||
|
? [...outcodeResults, ...postcodeResults, ...addressResults, ...otherPlaceResults]
|
||||||
|
: [...outcodeResults, ...postcodeResults, ...otherPlaceResults, ...addressResults],
|
||||||
|
trimmed
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
export type ViewportCenter = { lat: number; lng: number };
|
||||||
|
|
||||||
|
export function useLocationSearch(
|
||||||
|
mode?: string,
|
||||||
|
getViewportCenter?: () => ViewportCenter | null
|
||||||
|
) {
|
||||||
const [query, setQuery] = useState('');
|
const [query, setQuery] = useState('');
|
||||||
const [results, setResults] = useState<SearchResult[]>([]);
|
const [results, setResults] = useState<SearchResult[]>([]);
|
||||||
const [recentSearches, setRecentSearches] = useState<SearchResult[]>(readRecentSearches);
|
const [recentSearches, setRecentSearches] = useState<SearchResult[]>(readRecentSearches);
|
||||||
|
|
@ -165,6 +258,9 @@ export function useLocationSearch(mode?: string) {
|
||||||
const debounceRef = useRef<ReturnType<typeof setTimeout> | null>(null);
|
const debounceRef = useRef<ReturnType<typeof setTimeout> | null>(null);
|
||||||
const latestQueryRef = useRef('');
|
const latestQueryRef = useRef('');
|
||||||
const lastResultsRef = useRef<SearchResult[]>([]);
|
const lastResultsRef = useRef<SearchResult[]>([]);
|
||||||
|
// Held in a ref so a non-memoized callback from the parent doesn't churn handleInputChange.
|
||||||
|
const getViewportCenterRef = useRef(getViewportCenter);
|
||||||
|
getViewportCenterRef.current = getViewportCenter;
|
||||||
|
|
||||||
const handleInputChange = useCallback(
|
const handleInputChange = useCallback(
|
||||||
(value: string) => {
|
(value: string) => {
|
||||||
|
|
@ -212,6 +308,11 @@ export function useLocationSearch(mode?: string) {
|
||||||
try {
|
try {
|
||||||
const params = new URLSearchParams({ q: trimmed });
|
const params = new URLSearchParams({ q: trimmed });
|
||||||
if (mode) params.set('mode', mode);
|
if (mode) params.set('mode', mode);
|
||||||
|
const center = getViewportCenterRef.current?.();
|
||||||
|
if (center) {
|
||||||
|
params.set('lat', String(center.lat));
|
||||||
|
params.set('lng', String(center.lng));
|
||||||
|
}
|
||||||
const res = await fetch(
|
const res = await fetch(
|
||||||
`/api/places?${params}`,
|
`/api/places?${params}`,
|
||||||
authHeaders({ signal: controller.signal })
|
authHeaders({ signal: controller.signal })
|
||||||
|
|
@ -223,47 +324,19 @@ export function useLocationSearch(mode?: string) {
|
||||||
}
|
}
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
const json: {
|
const json: PlacesApiResponse = await res.json();
|
||||||
places: PlaceResult[];
|
|
||||||
postcodes?: string[];
|
|
||||||
addresses?: AddressResult[];
|
|
||||||
} = await res.json();
|
|
||||||
const placeResults = json.places.map((p) => ({
|
|
||||||
type: 'place' as const,
|
|
||||||
name: p.name,
|
|
||||||
slug: p.slug,
|
|
||||||
place_type: p.place_type,
|
|
||||||
lat: p.lat,
|
|
||||||
lon: p.lon,
|
|
||||||
city: p.city === 'City of London' ? 'London' : p.city,
|
|
||||||
}));
|
|
||||||
const outcodeResults = placeResults.filter((result) => result.place_type === 'outcode');
|
|
||||||
const otherPlaceResults = placeResults.filter(
|
|
||||||
(result) => result.place_type !== 'outcode'
|
|
||||||
);
|
|
||||||
const postcodeResults: SearchResult[] = (json.postcodes ?? []).map((postcode) => ({
|
|
||||||
type: 'postcode' as const,
|
|
||||||
label: postcode,
|
|
||||||
}));
|
|
||||||
const addressResults: SearchResult[] = (json.addresses ?? []).map((address) => ({
|
|
||||||
type: 'address' as const,
|
|
||||||
address: address.address,
|
|
||||||
postcode: address.postcode,
|
|
||||||
lat: address.lat,
|
|
||||||
lon: address.lon,
|
|
||||||
}));
|
|
||||||
const containsHouseNumber = /\d/.test(trimmed);
|
|
||||||
const combinedResults = (
|
const combinedResults = (
|
||||||
containsHouseNumber
|
Array.isArray(json.results)
|
||||||
? [...outcodeResults, ...postcodeResults, ...addressResults, ...otherPlaceResults]
|
? json.results.map(unifiedToSearchResult).filter(isNonNull)
|
||||||
: [...outcodeResults, ...postcodeResults, ...otherPlaceResults, ...addressResults]
|
: legacyCombineResults(json, trimmed)
|
||||||
).slice(0, 20);
|
).slice(0, 20);
|
||||||
if (controller.signal.aborted || latestQueryRef.current.trim() !== trimmed) {
|
if (controller.signal.aborted || latestQueryRef.current.trim() !== trimmed) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
lastResultsRef.current = combinedResults;
|
lastResultsRef.current = combinedResults;
|
||||||
const matchingResults = filterResultsForQuery(combinedResults, trimmed);
|
// Trust the server's unified ranking — re-filtering here previously dropped valid
|
||||||
setResults(matchingResults);
|
// alias and partial-postcode matches. The optimistic pre-fetch path still filters.
|
||||||
|
setResults(combinedResults);
|
||||||
setOpen(true);
|
setOpen(true);
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
logNonAbortError('places search', err);
|
logNonAbortError('places search', err);
|
||||||
|
|
|
||||||
|
|
@ -178,7 +178,11 @@ describe('resolveTransitVariant', () => {
|
||||||
|
|
||||||
describe('parseServerMode', () => {
|
describe('parseServerMode', () => {
|
||||||
it('round-trips the four toggle-reachable variants', () => {
|
it('round-trips the four toggle-reachable variants', () => {
|
||||||
expect(parseServerMode('transit')).toEqual({ mode: 'transit', noChange: false, noBuses: false });
|
expect(parseServerMode('transit')).toEqual({
|
||||||
|
mode: 'transit',
|
||||||
|
noChange: false,
|
||||||
|
noBuses: false,
|
||||||
|
});
|
||||||
expect(parseServerMode('transit-no-bus')).toEqual({
|
expect(parseServerMode('transit-no-bus')).toEqual({
|
||||||
mode: 'transit',
|
mode: 'transit',
|
||||||
noChange: false,
|
noChange: false,
|
||||||
|
|
@ -198,8 +202,16 @@ describe('parseServerMode', () => {
|
||||||
|
|
||||||
it('parses non-transit base modes', () => {
|
it('parses non-transit base modes', () => {
|
||||||
expect(parseServerMode('car')).toEqual({ mode: 'car', noChange: false, noBuses: false });
|
expect(parseServerMode('car')).toEqual({ mode: 'car', noChange: false, noBuses: false });
|
||||||
expect(parseServerMode('bicycle')).toEqual({ mode: 'bicycle', noChange: false, noBuses: false });
|
expect(parseServerMode('bicycle')).toEqual({
|
||||||
expect(parseServerMode('walking')).toEqual({ mode: 'walking', noChange: false, noBuses: false });
|
mode: 'bicycle',
|
||||||
|
noChange: false,
|
||||||
|
noBuses: false,
|
||||||
|
});
|
||||||
|
expect(parseServerMode('walking')).toEqual({
|
||||||
|
mode: 'walking',
|
||||||
|
noChange: false,
|
||||||
|
noBuses: false,
|
||||||
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
it('returns null for variants the UI cannot represent (no silent broadening)', () => {
|
it('returns null for variants the UI cannot represent (no silent broadening)', () => {
|
||||||
|
|
|
||||||
|
|
@ -138,7 +138,15 @@ export function useTravelTime(initial?: TravelTimeInitial) {
|
||||||
const handleAddEntry = useCallback((mode: TransportMode) => {
|
const handleAddEntry = useCallback((mode: TransportMode) => {
|
||||||
setEntries((prev) => [
|
setEntries((prev) => [
|
||||||
...prev,
|
...prev,
|
||||||
{ mode, slug: '', label: '', timeRange: null, useBest: false, noChange: false, noBuses: false },
|
{
|
||||||
|
mode,
|
||||||
|
slug: '',
|
||||||
|
label: '',
|
||||||
|
timeRange: null,
|
||||||
|
useBest: false,
|
||||||
|
noChange: false,
|
||||||
|
noBuses: false,
|
||||||
|
},
|
||||||
]);
|
]);
|
||||||
}, []);
|
}, []);
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -22,7 +22,8 @@ const descriptions: Record<string, Record<string, string>> = {
|
||||||
'Est. price per sqm': 'Prix actuel estimé rapporté à la surface totale',
|
'Est. price per sqm': 'Prix actuel estimé rapporté à la surface totale',
|
||||||
'Estimated monthly rent': 'Loyer mensuel privé moyen dans le secteur',
|
'Estimated monthly rent': 'Loyer mensuel privé moyen dans le secteur',
|
||||||
'Total floor area (sqm)': 'Surface intérieure relevée lors du diagnostic EPC',
|
'Total floor area (sqm)': 'Surface intérieure relevée lors du diagnostic EPC',
|
||||||
'Number of bedrooms & living rooms': 'Nombre de pièces habitables relevé lors du diagnostic EPC',
|
'Number of bedrooms & living rooms':
|
||||||
|
'Nombre de pièces habitables relevé lors du diagnostic EPC',
|
||||||
'Construction year': 'Année de construction estimée à partir de l’EPC',
|
'Construction year': 'Année de construction estimée à partir de l’EPC',
|
||||||
'Date of last transaction': 'Date de la vente la plus récente enregistrée au Land Registry',
|
'Date of last transaction': 'Date de la vente la plus récente enregistrée au Land Registry',
|
||||||
'Former council house': 'Indique si le bien a déjà été répertorié comme logement social',
|
'Former council house': 'Indique si le bien a déjà été répertorié comme logement social',
|
||||||
|
|
@ -30,7 +31,8 @@ const descriptions: Record<string, Record<string, string>> = {
|
||||||
'Potential energy rating':
|
'Potential energy rating':
|
||||||
'Classe énergétique EPC possible si toutes les améliorations recommandées étaient réalisées',
|
'Classe énergétique EPC possible si toutes les améliorations recommandées étaient réalisées',
|
||||||
'Interior height (m)': 'Hauteur intérieure moyenne relevée lors du diagnostic EPC',
|
'Interior height (m)': 'Hauteur intérieure moyenne relevée lors du diagnostic EPC',
|
||||||
'Street tree density percentile': 'Percentile estimé de couverture arborée autour du code postal',
|
'Street tree density percentile':
|
||||||
|
'Percentile estimé de couverture arborée autour du code postal',
|
||||||
'Within conservation area':
|
'Within conservation area':
|
||||||
'Indique si le point représentatif du code postal se situe dans une zone de conservation désignée',
|
'Indique si le point représentatif du code postal se situe dans une zone de conservation désignée',
|
||||||
'Listed building':
|
'Listed building':
|
||||||
|
|
@ -67,7 +69,8 @@ const descriptions: Record<string, Record<string, string>> = {
|
||||||
'Moyenne annuelle des violences et infractions sexuelles dans le secteur',
|
'Moyenne annuelle des violences et infractions sexuelles dans le secteur',
|
||||||
'Burglary (avg/yr)': 'Moyenne annuelle des cambriolages dans le secteur',
|
'Burglary (avg/yr)': 'Moyenne annuelle des cambriolages dans le secteur',
|
||||||
'Robbery (avg/yr)': 'Moyenne annuelle des vols avec violence dans le secteur',
|
'Robbery (avg/yr)': 'Moyenne annuelle des vols avec violence dans le secteur',
|
||||||
'Vehicle crime (avg/yr)': 'Moyenne annuelle des infractions liées aux véhicules dans le secteur',
|
'Vehicle crime (avg/yr)':
|
||||||
|
'Moyenne annuelle des infractions liées aux véhicules dans le secteur',
|
||||||
'Anti-social behaviour (avg/yr)':
|
'Anti-social behaviour (avg/yr)':
|
||||||
'Moyenne annuelle des comportements antisociaux dans le secteur',
|
'Moyenne annuelle des comportements antisociaux dans le secteur',
|
||||||
'Criminal damage and arson (avg/yr)':
|
'Criminal damage and arson (avg/yr)':
|
||||||
|
|
@ -89,8 +92,7 @@ const descriptions: Record<string, Record<string, string>> = {
|
||||||
'% Mixed':
|
'% Mixed':
|
||||||
'Part de la population s’identifiant comme métisse ou de plusieurs groupes ethniques',
|
'Part de la population s’identifiant comme métisse ou de plusieurs groupes ethniques',
|
||||||
'% Other': 'Part de la population s’identifiant comme appartenant à un autre groupe ethnique',
|
'% Other': 'Part de la population s’identifiant comme appartenant à un autre groupe ethnique',
|
||||||
'Voter turnout (%)':
|
'Voter turnout (%)': 'Part des électeurs inscrits ayant voté aux élections générales de 2024',
|
||||||
'Part des électeurs inscrits ayant voté aux élections générales de 2024',
|
|
||||||
'% Labour': 'Part des voix travaillistes aux élections générales de 2024',
|
'% Labour': 'Part des voix travaillistes aux élections générales de 2024',
|
||||||
'% Conservative': 'Part des voix conservatrices aux élections générales de 2024',
|
'% Conservative': 'Part des voix conservatrices aux élections générales de 2024',
|
||||||
'% Liberal Democrat': 'Part des voix libérales-démocrates aux élections générales de 2024',
|
'% Liberal Democrat': 'Part des voix libérales-démocrates aux élections générales de 2024',
|
||||||
|
|
@ -98,8 +100,10 @@ const descriptions: Record<string, Record<string, string>> = {
|
||||||
'% Green': 'Part des voix vertes aux élections générales de 2024',
|
'% Green': 'Part des voix vertes aux élections générales de 2024',
|
||||||
'% Other parties': 'Part cumulée des voix de tous les autres partis et indépendants',
|
'% Other parties': 'Part cumulée des voix de tous les autres partis et indépendants',
|
||||||
'Distance to nearest park (km)': 'Distance au parc ou espace vert le plus proche',
|
'Distance to nearest park (km)': 'Distance au parc ou espace vert le plus proche',
|
||||||
'Noise (dB)': 'Le plus élevé des bruits routier, ferroviaire ou aérien près du code postal, en décibels (Lden). Angleterre uniquement ; vide = hors zone cartographiée, pas forcément calme.',
|
'Noise (dB)':
|
||||||
'Max available download speed (Mbps)': 'Débit descendant haut débit maximal disponible au code postal',
|
'Le plus élevé des bruits routier, ferroviaire ou aérien près du code postal, en décibels (Lden). Angleterre uniquement ; vide = hors zone cartographiée, pas forcément calme.',
|
||||||
|
'Max available download speed (Mbps)':
|
||||||
|
'Débit descendant haut débit maximal disponible au code postal',
|
||||||
Schools: 'Écoles primaires et secondaires notées à proximité',
|
Schools: 'Écoles primaires et secondaires notées à proximité',
|
||||||
'Specific crimes': 'Filtrer une seule catégorie d’infractions de rue à la fois',
|
'Specific crimes': 'Filtrer une seule catégorie d’infractions de rue à la fois',
|
||||||
Ethnicities: 'Part de la population par groupe ethnique',
|
Ethnicities: 'Part de la population par groupe ethnique',
|
||||||
|
|
@ -152,8 +156,7 @@ const descriptions: Record<string, Record<string, string>> = {
|
||||||
'Education, Skills and Training Score':
|
'Education, Skills and Training Score':
|
||||||
'Benachteiligungsperzentil für Bildung, Kompetenzen und Ausbildung (höher = weniger benachteiligt)',
|
'Benachteiligungsperzentil für Bildung, Kompetenzen und Ausbildung (höher = weniger benachteiligt)',
|
||||||
'Income Score': 'Einkommensbenachteiligungsperzentil (höher = weniger benachteiligt)',
|
'Income Score': 'Einkommensbenachteiligungsperzentil (höher = weniger benachteiligt)',
|
||||||
'Employment Score':
|
'Employment Score': 'Beschäftigungsbenachteiligungsperzentil (höher = weniger benachteiligt)',
|
||||||
'Beschäftigungsbenachteiligungsperzentil (höher = weniger benachteiligt)',
|
|
||||||
'Health Deprivation and Disability Score':
|
'Health Deprivation and Disability Score':
|
||||||
'Perzentil für gesundheitliche Benachteiligung und Behinderung (höher = bessere Ergebnisse)',
|
'Perzentil für gesundheitliche Benachteiligung und Behinderung (höher = bessere Ergebnisse)',
|
||||||
'Housing Conditions Score': 'Wohnbedingungen-Perzentil (höher = bessere Bedingungen)',
|
'Housing Conditions Score': 'Wohnbedingungen-Perzentil (höher = bessere Bedingungen)',
|
||||||
|
|
@ -198,7 +201,8 @@ const descriptions: Record<string, Record<string, string>> = {
|
||||||
'% Green': 'Stimmenanteil der Grünen bei der Parlamentswahl 2024',
|
'% Green': 'Stimmenanteil der Grünen bei der Parlamentswahl 2024',
|
||||||
'% Other parties': 'Kombinierter Stimmenanteil aller anderen Parteien und Unabhängigen',
|
'% Other parties': 'Kombinierter Stimmenanteil aller anderen Parteien und Unabhängigen',
|
||||||
'Distance to nearest park (km)': 'Entfernung zum nächsten Park oder Grünfläche',
|
'Distance to nearest park (km)': 'Entfernung zum nächsten Park oder Grünfläche',
|
||||||
'Noise (dB)': 'Lautester von Straßen-, Bahn- oder Fluglärm in der Nähe des Postcodes in Dezibel (Lden). Nur England; leer = nicht kartiert, nicht unbedingt leise.',
|
'Noise (dB)':
|
||||||
|
'Lautester von Straßen-, Bahn- oder Fluglärm in der Nähe des Postcodes in Dezibel (Lden). Nur England; leer = nicht kartiert, nicht unbedingt leise.',
|
||||||
'Max available download speed (Mbps)':
|
'Max available download speed (Mbps)':
|
||||||
'Maximal verfügbare Breitband-Downloadgeschwindigkeit am Postcode',
|
'Maximal verfügbare Breitband-Downloadgeschwindigkeit am Postcode',
|
||||||
Schools: 'Bewertete Grundschulen und weiterführende Schulen in der Nähe',
|
Schools: 'Bewertete Grundschulen und weiterführende Schulen in der Nähe',
|
||||||
|
|
@ -273,7 +277,8 @@ const descriptions: Record<string, Record<string, string>> = {
|
||||||
'% Green': '2024 年大选中绿党得票率',
|
'% Green': '2024 年大选中绿党得票率',
|
||||||
'% Other parties': '所有其他政党和独立候选人的综合得票率',
|
'% Other parties': '所有其他政党和独立候选人的综合得票率',
|
||||||
'Distance to nearest park (km)': '到最近公园或绿地的距离',
|
'Distance to nearest park (km)': '到最近公园或绿地的距离',
|
||||||
'Noise (dB)': '该邮编附近道路、铁路或机场中最高的噪音水平(Lden,分贝)。仅英格兰;空白表示未覆盖,不一定安静。',
|
'Noise (dB)':
|
||||||
|
'该邮编附近道路、铁路或机场中最高的噪音水平(Lden,分贝)。仅英格兰;空白表示未覆盖,不一定安静。',
|
||||||
'Max available download speed (Mbps)': '该邮编可用的最高宽带下载速度',
|
'Max available download speed (Mbps)': '该邮编可用的最高宽带下载速度',
|
||||||
Schools: '附近有评级的小学和中学',
|
Schools: '附近有评级的小学和中学',
|
||||||
'Specific crimes': '一次筛选一种街面犯罪类别',
|
'Specific crimes': '一次筛选一种街面犯罪类别',
|
||||||
|
|
@ -358,7 +363,8 @@ const descriptions: Record<string, Record<string, string>> = {
|
||||||
'% Green': '2024 आम चुनाव में ग्रीन पार्टी का मत-प्रतिशत',
|
'% Green': '2024 आम चुनाव में ग्रीन पार्टी का मत-प्रतिशत',
|
||||||
'% Other parties': 'बाकी सभी पार्टियों और निर्दलीयों का संयुक्त मत-प्रतिशत',
|
'% Other parties': 'बाकी सभी पार्टियों और निर्दलीयों का संयुक्त मत-प्रतिशत',
|
||||||
'Distance to nearest park (km)': 'निकटतम पार्क या हरित क्षेत्र तक दूरी',
|
'Distance to nearest park (km)': 'निकटतम पार्क या हरित क्षेत्र तक दूरी',
|
||||||
'Noise (dB)': 'पोस्टकोड के पास सड़क, रेल या हवाई अड्डे के शोर में सबसे अधिक, डेसीबल (Lden) में। केवल इंग्लैंड; खाली = मैप नहीं किया गया, जरूरी नहीं कि शांत हो।',
|
'Noise (dB)':
|
||||||
|
'पोस्टकोड के पास सड़क, रेल या हवाई अड्डे के शोर में सबसे अधिक, डेसीबल (Lden) में। केवल इंग्लैंड; खाली = मैप नहीं किया गया, जरूरी नहीं कि शांत हो।',
|
||||||
'Max available download speed (Mbps)': 'पोस्टकोड पर उपलब्ध अधिकतम डाउनलोड गति',
|
'Max available download speed (Mbps)': 'पोस्टकोड पर उपलब्ध अधिकतम डाउनलोड गति',
|
||||||
Schools: 'पास के रेटेड प्राइमरी और सेकेंडरी स्कूल',
|
Schools: 'पास के रेटेड प्राइमरी और सेकेंडरी स्कूल',
|
||||||
'Specific crimes': 'एक समय में एक सड़क-स्तर अपराध श्रेणी से फिल्टर करें',
|
'Specific crimes': 'एक समय में एक सड़क-स्तर अपराध श्रेणी से फिल्टर करें',
|
||||||
|
|
@ -450,7 +456,8 @@ const descriptions: Record<string, Record<string, string>> = {
|
||||||
'% Green': 'A Zöld Párt szavazataránya a 2024-es parlamenti választáson',
|
'% Green': 'A Zöld Párt szavazataránya a 2024-es parlamenti választáson',
|
||||||
'% Other parties': 'Az összes többi párt és független jelölt összesített szavazataránya',
|
'% Other parties': 'Az összes többi párt és független jelölt összesített szavazataránya',
|
||||||
'Distance to nearest park (km)': 'Távolság a legközelebbi parkig vagy zöldterületig',
|
'Distance to nearest park (km)': 'Távolság a legközelebbi parkig vagy zöldterületig',
|
||||||
'Noise (dB)': 'Az út-, vasúti vagy repülőtéri zaj közül a leghangosabb az irányítószámnál, decibelben (Lden). Csak Anglia; üres = nem térképezett, nem feltétlenül csendes.',
|
'Noise (dB)':
|
||||||
|
'Az út-, vasúti vagy repülőtéri zaj közül a leghangosabb az irányítószámnál, decibelben (Lden). Csak Anglia; üres = nem térképezett, nem feltétlenül csendes.',
|
||||||
'Max available download speed (Mbps)':
|
'Max available download speed (Mbps)':
|
||||||
'Az irányítószámnál elérhető maximális szélessávú letöltési sebesség',
|
'Az irányítószámnál elérhető maximális szélessávú letöltési sebesség',
|
||||||
Schools: 'Közeli minősített általános és középiskolák',
|
Schools: 'Közeli minősített általános és középiskolák',
|
||||||
|
|
|
||||||
|
|
@ -8,7 +8,7 @@ export const details: Record<string, Record<string, string>> = {
|
||||||
'Property type':
|
'Property type':
|
||||||
'Données HM Land Registry Price Paid et certificats EPC. Maison individuelle, maison jumelée, maison mitoyenne (tous les sous-types de maisons en rangée), appartement/maisonette, ou autre type (bungalows, park homes, etc.).',
|
'Données HM Land Registry Price Paid et certificats EPC. Maison individuelle, maison jumelée, maison mitoyenne (tous les sous-types de maisons en rangée), appartement/maisonette, ou autre type (bungalows, park homes, etc.).',
|
||||||
'Leasehold/Freehold':
|
'Leasehold/Freehold':
|
||||||
"Données HM Land Registry Price Paid. Freehold signifie que vous possédez le bâtiment et le terrain sur lequel il se trouve. Leasehold signifie que vous possédez le bâtiment mais pas le terrain : vous détenez un bail accordé par le freeholder pour une durée déterminée.",
|
'Données HM Land Registry Price Paid. Freehold signifie que vous possédez le bâtiment et le terrain sur lequel il se trouve. Leasehold signifie que vous possédez le bâtiment mais pas le terrain : vous détenez un bail accordé par le freeholder pour une durée déterminée.',
|
||||||
'Last known price':
|
'Last known price':
|
||||||
"Le dernier prix de vente enregistré pour ce bien provenant des données HM Land Registry Price Paid. Couvre les ventes résidentielles en Angleterre. Peut dater de plusieurs années si le bien n'a pas été vendu récemment.",
|
"Le dernier prix de vente enregistré pour ce bien provenant des données HM Land Registry Price Paid. Couvre les ventes résidentielles en Angleterre. Peut dater de plusieurs années si le bien n'a pas été vendu récemment.",
|
||||||
'Estimated current price':
|
'Estimated current price':
|
||||||
|
|
@ -72,7 +72,7 @@ export const details: Record<string, Record<string, string>> = {
|
||||||
'Serious crime (avg/yr)':
|
'Serious crime (avg/yr)':
|
||||||
"Somme annuelle des violences, robberies, burglaries et possessions d'armes dans un rayon de 50 m du code postal, comptée à partir des points de criminalité street-level de police.uk (anonymisés et rattachés à des points cartographiques proches). Fournit un indicateur unique de criminalité grave.",
|
"Somme annuelle des violences, robberies, burglaries et possessions d'armes dans un rayon de 50 m du code postal, comptée à partir des points de criminalité street-level de police.uk (anonymisés et rattachés à des points cartographiques proches). Fournit un indicateur unique de criminalité grave.",
|
||||||
'Minor crime (avg/yr)':
|
'Minor crime (avg/yr)':
|
||||||
"Somme annuelle des comportements antisociaux, shoplifting, vols de vélos et autres infractions de moindre gravité dans un rayon de 50 m du code postal, comptée à partir des points de criminalité street-level de police.uk (anonymisés et rattachés à des points cartographiques proches). Fournit un indicateur unique de criminalité mineure.",
|
'Somme annuelle des comportements antisociaux, shoplifting, vols de vélos et autres infractions de moindre gravité dans un rayon de 50 m du code postal, comptée à partir des points de criminalité street-level de police.uk (anonymisés et rattachés à des points cartographiques proches). Fournit un indicateur unique de criminalité mineure.',
|
||||||
'Violence and sexual offences (avg/yr)':
|
'Violence and sexual offences (avg/yr)':
|
||||||
'Nombre moyen annuel de violences et infractions sexuelles dans un rayon de 50 m du code postal, d’après les données de criminalité street-level de police.uk. Inclut les agressions, le harcèlement et les infractions sexuelles.',
|
'Nombre moyen annuel de violences et infractions sexuelles dans un rayon de 50 m du code postal, d’après les données de criminalité street-level de police.uk. Inclut les agressions, le harcèlement et les infractions sexuelles.',
|
||||||
'Burglary (avg/yr)':
|
'Burglary (avg/yr)':
|
||||||
|
|
|
||||||
|
|
@ -2,9 +2,12 @@
|
||||||
|
|
||||||
A travel-time parquet file is considered corrupted when the R5 routing
|
A travel-time parquet file is considered corrupted when the R5 routing
|
||||||
computation failed or was interrupted, leaving either zero rows or only
|
computation failed or was interrupted, leaving either zero rows or only
|
||||||
the origin postcode. We detect this by comparing each file's row count
|
the origin postcode. We detect this by an absolute, structural criterion:
|
||||||
against a per-mode threshold derived from the 5th-percentile of all files
|
a file is corrupt only when it is unreadable or has a row count at or below
|
||||||
in that mode. Files at or below 1 row are always flagged.
|
CORRUPT_ROW_FLOOR. Per-mode percentile/median/range figures are reported
|
||||||
|
for context only — they never drive the deletable set, so repeated runs
|
||||||
|
(including with --delete) are idempotent and never erode legitimate
|
||||||
|
small-catchment (rural/island) origins.
|
||||||
|
|
||||||
Duplicates arise when places.parquet is rebuilt between R5 runs — each
|
Duplicates arise when places.parquet is rebuilt between R5 runs — each
|
||||||
place gets a new numeric index prefix, so the skip-completed logic
|
place gets a new numeric index prefix, so the skip-completed logic
|
||||||
|
|
@ -13,7 +16,6 @@ file per slug and removes the rest.
|
||||||
|
|
||||||
Usage:
|
Usage:
|
||||||
uv run python pipeline/check_travel_times.py [--travel-times property-data/travel-times]
|
uv run python pipeline/check_travel_times.py [--travel-times property-data/travel-times]
|
||||||
[--threshold-pct 5]
|
|
||||||
[--delete]
|
[--delete]
|
||||||
[--dedup]
|
[--dedup]
|
||||||
"""
|
"""
|
||||||
|
|
@ -28,6 +30,12 @@ from pathlib import Path
|
||||||
|
|
||||||
import polars as pl
|
import polars as pl
|
||||||
|
|
||||||
|
# Absolute row-count floor for corruption: a readable file with this many
|
||||||
|
# rows or fewer holds at most the origin postcode (R5 failed/interrupted).
|
||||||
|
# This is a structural threshold, NOT a population percentile, so repeated
|
||||||
|
# runs are idempotent and never delete a fresh fraction of valid files.
|
||||||
|
CORRUPT_ROW_FLOOR = 1
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class BadFile:
|
class BadFile:
|
||||||
|
|
@ -74,10 +82,14 @@ def percentile(values: list[int], pct: float) -> float:
|
||||||
return s[lo] + frac * (s[hi] - s[lo])
|
return s[lo] + frac * (s[hi] - s[lo])
|
||||||
|
|
||||||
|
|
||||||
def find_bad_files(
|
def find_bad_files(base_dir: Path) -> tuple[list[BadFile], dict[str, dict]]:
|
||||||
base_dir: Path, threshold_pct: float
|
"""Scan all modes and return bad files + per-mode stats.
|
||||||
) -> tuple[list[BadFile], dict[str, dict]]:
|
|
||||||
"""Scan all modes and return bad files + per-mode stats."""
|
A file is "bad" (deletable) only by an absolute structural criterion:
|
||||||
|
it is unreadable (rows < 0) or holds at most the origin postcode
|
||||||
|
(rows <= CORRUPT_ROW_FLOOR). The p5/median/min/max figures are computed
|
||||||
|
purely for reporting and do NOT influence the deletable set.
|
||||||
|
"""
|
||||||
bad: list[BadFile] = []
|
bad: list[BadFile] = []
|
||||||
stats: dict[str, dict] = {}
|
stats: dict[str, dict] = {}
|
||||||
|
|
||||||
|
|
@ -93,15 +105,14 @@ def find_bad_files(
|
||||||
if not row_counts:
|
if not row_counts:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
p5 = percentile(row_counts, threshold_pct)
|
# Reporting statistics only — these never decide what gets deleted.
|
||||||
|
p5 = percentile(row_counts, 5)
|
||||||
median = percentile(row_counts, 50)
|
median = percentile(row_counts, 50)
|
||||||
# Threshold: max of 1 and the chosen percentile — ensures we always
|
|
||||||
# catch files with 0-1 rows even if p5 is 0 (e.g. walking mode).
|
|
||||||
threshold = max(1, int(p5))
|
|
||||||
|
|
||||||
mode_bad = []
|
mode_bad = []
|
||||||
for filename, slug, rows in entries:
|
for filename, slug, rows in entries:
|
||||||
if rows <= threshold:
|
# Corrupt = unreadable, or at/below the absolute origin-only floor.
|
||||||
|
if rows < 0 or rows <= CORRUPT_ROW_FLOOR:
|
||||||
bf = BadFile(mode=mode, filename=filename, slug=slug, rows=rows)
|
bf = BadFile(mode=mode, filename=filename, slug=slug, rows=rows)
|
||||||
mode_bad.append(bf)
|
mode_bad.append(bf)
|
||||||
bad.append(bf)
|
bad.append(bf)
|
||||||
|
|
@ -110,7 +121,7 @@ def find_bad_files(
|
||||||
"total": len(entries),
|
"total": len(entries),
|
||||||
"errors": errors,
|
"errors": errors,
|
||||||
"bad": len(mode_bad),
|
"bad": len(mode_bad),
|
||||||
"threshold": threshold,
|
"floor": CORRUPT_ROW_FLOOR,
|
||||||
"p5": p5,
|
"p5": p5,
|
||||||
"median": median,
|
"median": median,
|
||||||
"min": min(row_counts),
|
"min": min(row_counts),
|
||||||
|
|
@ -169,16 +180,13 @@ def main() -> None:
|
||||||
default=Path("property-data/travel-times"),
|
default=Path("property-data/travel-times"),
|
||||||
help="Path to travel-times directory",
|
help="Path to travel-times directory",
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
|
||||||
"--threshold-pct",
|
|
||||||
type=float,
|
|
||||||
default=5,
|
|
||||||
help="Percentile below which files are flagged (default: 5th)",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--delete",
|
"--delete",
|
||||||
action="store_true",
|
action="store_true",
|
||||||
help="Delete corrupted files (so R5 will recompute them)",
|
help=(
|
||||||
|
"Delete corrupted files (unreadable or "
|
||||||
|
f"<= {CORRUPT_ROW_FLOOR} row) so R5 will recompute them"
|
||||||
|
),
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--dedup",
|
"--dedup",
|
||||||
|
|
@ -192,18 +200,20 @@ def main() -> None:
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
# --- Corruption check ---
|
# --- Corruption check ---
|
||||||
bad_files, stats = find_bad_files(args.travel_times, args.threshold_pct)
|
bad_files, stats = find_bad_files(args.travel_times)
|
||||||
|
|
||||||
print("=== Per-mode summary ===\n")
|
print("=== Per-mode summary ===\n")
|
||||||
|
# Floor is the absolute deletion threshold; p5/median/range are reporting
|
||||||
|
# context only and do not affect which files are flagged as corrupt.
|
||||||
print(
|
print(
|
||||||
f"{'Mode':<10} {'Total':>6} {'Bad':>5} {'Threshold':>10} {'Median':>8} {'Range':>20}"
|
f"{'Mode':<10} {'Total':>6} {'Bad':>5} {'Floor':>6} {'P5':>8} {'Median':>8} {'Range':>20}"
|
||||||
)
|
)
|
||||||
print("-" * 65)
|
print("-" * 71)
|
||||||
for mode, s in sorted(stats.items()):
|
for mode, s in sorted(stats.items()):
|
||||||
rng = f"{s['min']:,}–{s['max']:,}"
|
rng = f"{s['min']:,}–{s['max']:,}"
|
||||||
print(
|
print(
|
||||||
f"{mode:<10} {s['total']:>6} {s['bad']:>5} {s['threshold']:>10,} "
|
f"{mode:<10} {s['total']:>6} {s['bad']:>5} {s['floor']:>6,} "
|
||||||
f"{s['median']:>8,.0f} {rng:>20}"
|
f"{s['p5']:>8,.0f} {s['median']:>8,.0f} {rng:>20}"
|
||||||
)
|
)
|
||||||
|
|
||||||
if bad_files:
|
if bad_files:
|
||||||
|
|
|
||||||
|
|
@ -51,8 +51,16 @@ def _obtain_zip(dest: Path) -> None:
|
||||||
|
|
||||||
|
|
||||||
def convert_to_parquet(extract_dir: Path, parquet_path: Path) -> None:
|
def convert_to_parquet(extract_dir: Path, parquet_path: Path) -> None:
|
||||||
# Find CSV files in the extracted directory
|
# Find CSV files in the extracted directory. The zip ships two sibling
|
||||||
csv_files = list(extract_dir.rglob("*.csv"))
|
# dirs with identical headers: postcode_files/ (all premises) and
|
||||||
|
# postcode_res_files/ (residential only). Take all-premises only so each
|
||||||
|
# postcode_space appears once; matching on the dir name keeps this
|
||||||
|
# resilient to the date-stamped top-level folder prefix.
|
||||||
|
csv_files = [
|
||||||
|
f
|
||||||
|
for f in extract_dir.rglob("*.csv")
|
||||||
|
if "postcode_res_files" not in f.parts
|
||||||
|
]
|
||||||
if not csv_files:
|
if not csv_files:
|
||||||
raise FileNotFoundError(f"No CSV files found in {extract_dir}")
|
raise FileNotFoundError(f"No CSV files found in {extract_dir}")
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -17,7 +17,12 @@ TUBE_STATION_MERGE_RADIUS_DEGREES = 0.01
|
||||||
|
|
||||||
STOP_TYPES = {
|
STOP_TYPES = {
|
||||||
"AIR": "Airport",
|
"AIR": "Airport",
|
||||||
|
# Ferry: FER/FBT are the terminal/berth nodes; FTD is a docking entrance.
|
||||||
|
"FER": "Ferry",
|
||||||
|
"FBT": "Ferry",
|
||||||
"FTD": "Ferry",
|
"FTD": "Ferry",
|
||||||
|
# Rail: RLY is the station node; RSE is a station entrance.
|
||||||
|
"RLY": "Rail station",
|
||||||
"RSE": "Rail station",
|
"RSE": "Rail station",
|
||||||
"BCT": "Bus stop",
|
"BCT": "Bus stop",
|
||||||
"BCE": "Bus station",
|
"BCE": "Bus station",
|
||||||
|
|
@ -26,6 +31,16 @@ STOP_TYPES = {
|
||||||
"MET": "Tube station",
|
"MET": "Tube station",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Stop types that are access/entrance nodes rather than the primary station or
|
||||||
|
# terminal node. During dedup the primary node (e.g. RLY/FER) wins so a station
|
||||||
|
# with both a station node and entrances yields one POI at the station node.
|
||||||
|
ENTRANCE_STOP_TYPES = {"RSE", "FTD"}
|
||||||
|
|
||||||
|
# Categories whose entrances/variants are merged into a single station-level POI
|
||||||
|
# by normalized name + area (like Tube stations), so an RLY node and its RSE
|
||||||
|
# entrances collapse to one POI at the station node.
|
||||||
|
STATION_MERGE_CATEGORIES = {TUBE_STATION_CATEGORY, "Rail station", "Ferry"}
|
||||||
|
|
||||||
|
|
||||||
OUTPUT_COLUMNS = ["id", "name", "category", "lat", "lng"]
|
OUTPUT_COLUMNS = ["id", "name", "category", "lat", "lng"]
|
||||||
|
|
||||||
|
|
@ -97,7 +112,9 @@ def _empty_output_frame() -> pl.DataFrame:
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def station_name_score(name: str) -> tuple[int, int]:
|
def station_name_score(name: str, entrance: bool = False) -> tuple[int, int, int]:
|
||||||
|
# Prefer the primary station/terminal node over an entrance, then a name
|
||||||
|
# without a transport-mode suffix, then the shorter name.
|
||||||
lower = name.lower()
|
lower = name.lower()
|
||||||
suffix_penalty = int(
|
suffix_penalty = int(
|
||||||
lower.endswith(
|
lower.endswith(
|
||||||
|
|
@ -112,7 +129,7 @@ def station_name_score(name: str) -> tuple[int, int]:
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
return (suffix_penalty, len(name))
|
return (int(entrance), suffix_penalty, len(name))
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
|
|
@ -122,6 +139,7 @@ class StationAccumulator:
|
||||||
category: str
|
category: str
|
||||||
lat_sum: float
|
lat_sum: float
|
||||||
lng_sum: float
|
lng_sum: float
|
||||||
|
entrance: bool = False
|
||||||
count: int = 1
|
count: int = 1
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
|
@ -143,9 +161,13 @@ class StationAccumulator:
|
||||||
self.count += 1
|
self.count += 1
|
||||||
|
|
||||||
name = str(row["name"] or "")
|
name = str(row["name"] or "")
|
||||||
if station_name_score(name) < station_name_score(self.name):
|
entrance = bool(row.get("entrance"))
|
||||||
|
if station_name_score(name, entrance) < station_name_score(
|
||||||
|
self.name, self.entrance
|
||||||
|
):
|
||||||
self.id = str(row["id"] or "")
|
self.id = str(row["id"] or "")
|
||||||
self.name = name
|
self.name = name
|
||||||
|
self.entrance = entrance
|
||||||
|
|
||||||
|
|
||||||
def _station_from_row(row: dict[str, object]) -> StationAccumulator:
|
def _station_from_row(row: dict[str, object]) -> StationAccumulator:
|
||||||
|
|
@ -155,19 +177,23 @@ def _station_from_row(row: dict[str, object]) -> StationAccumulator:
|
||||||
category=str(row["category"] or ""),
|
category=str(row["category"] or ""),
|
||||||
lat_sum=float(row["lat"]),
|
lat_sum=float(row["lat"]),
|
||||||
lng_sum=float(row["lng"]),
|
lng_sum=float(row["lng"]),
|
||||||
|
entrance=bool(row.get("entrance")),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def _deduplicate_tube_stations(df: pl.DataFrame) -> pl.DataFrame:
|
def _deduplicate_station_areas(df: pl.DataFrame) -> pl.DataFrame:
|
||||||
if len(df) == 0:
|
if len(df) == 0:
|
||||||
return _empty_output_frame()
|
return _empty_output_frame()
|
||||||
|
|
||||||
selected: list[StationAccumulator] = []
|
selected: list[StationAccumulator] = []
|
||||||
groups: dict[str, list[int]] = {}
|
groups: dict[tuple[str, str], list[int]] = {}
|
||||||
|
|
||||||
for row in df.iter_rows(named=True):
|
for row in df.iter_rows(named=True):
|
||||||
station_key = canonical_station_name(str(row["name"] or ""))
|
# Key by category so different modes sharing a name/area (e.g. a rail
|
||||||
if not station_key:
|
# station and a ferry terminal) are not merged into one POI.
|
||||||
|
category = str(row["category"] or "")
|
||||||
|
station_key = (category, canonical_station_name(str(row["name"] or "")))
|
||||||
|
if not station_key[1]:
|
||||||
selected.append(_station_from_row(row))
|
selected.append(_station_from_row(row))
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
|
@ -198,7 +224,7 @@ def _deduplicate_tube_stations(df: pl.DataFrame) -> pl.DataFrame:
|
||||||
).select(OUTPUT_COLUMNS)
|
).select(OUTPUT_COLUMNS)
|
||||||
|
|
||||||
|
|
||||||
def _deduplicate_non_tube_stops(df: pl.DataFrame) -> pl.DataFrame:
|
def _deduplicate_local_stops(df: pl.DataFrame) -> pl.DataFrame:
|
||||||
if len(df) == 0:
|
if len(df) == 0:
|
||||||
return _empty_output_frame()
|
return _empty_output_frame()
|
||||||
|
|
||||||
|
|
@ -218,7 +244,10 @@ def _deduplicate_non_tube_stops(df: pl.DataFrame) -> pl.DataFrame:
|
||||||
.select(OUTPUT_COLUMNS)
|
.select(OUTPUT_COLUMNS)
|
||||||
)
|
)
|
||||||
if len(no_loc) > 0:
|
if len(no_loc) > 0:
|
||||||
frames.append(no_loc.select(OUTPUT_COLUMNS))
|
# Stops with no locality can't be deduped by locality, so merge genuine
|
||||||
|
# co-located duplicates (same name+category within the same small area)
|
||||||
|
# via the station-area logic, while keeping distinct far-apart stops.
|
||||||
|
frames.append(_deduplicate_station_areas(no_loc))
|
||||||
|
|
||||||
if not frames:
|
if not frames:
|
||||||
return _empty_output_frame()
|
return _empty_output_frame()
|
||||||
|
|
@ -227,14 +256,20 @@ def _deduplicate_non_tube_stops(df: pl.DataFrame) -> pl.DataFrame:
|
||||||
|
|
||||||
|
|
||||||
def deduplicate_naptan(df: pl.DataFrame) -> pl.DataFrame:
|
def deduplicate_naptan(df: pl.DataFrame) -> pl.DataFrame:
|
||||||
"""Deduplicate NaPTAN stops, with station-level merging for Tube POIs."""
|
"""Deduplicate NaPTAN stops, merging station/terminal entrances by area.
|
||||||
tube = df.filter(pl.col("category") == TUBE_STATION_CATEGORY)
|
|
||||||
other = df.filter(pl.col("category") != TUBE_STATION_CATEGORY)
|
Tube, rail and ferry POIs are merged to one record per station by
|
||||||
|
normalized name + area, with the primary station/terminal node (e.g. RLY,
|
||||||
|
FER) winning over an entrance node (RSE, FTD). Other stops are deduplicated
|
||||||
|
by exact name+category+locality.
|
||||||
|
"""
|
||||||
|
station = df.filter(pl.col("category").is_in(list(STATION_MERGE_CATEGORIES)))
|
||||||
|
other = df.filter(~pl.col("category").is_in(list(STATION_MERGE_CATEGORIES)))
|
||||||
|
|
||||||
return pl.concat(
|
return pl.concat(
|
||||||
[
|
[
|
||||||
_deduplicate_non_tube_stops(other),
|
_deduplicate_local_stops(other),
|
||||||
_deduplicate_tube_stations(tube),
|
_deduplicate_station_areas(station),
|
||||||
]
|
]
|
||||||
).select(OUTPUT_COLUMNS)
|
).select(OUTPUT_COLUMNS)
|
||||||
|
|
||||||
|
|
@ -263,6 +298,7 @@ def download_naptan(output: Path) -> None:
|
||||||
pl.col("Latitude").alias("lat"),
|
pl.col("Latitude").alias("lat"),
|
||||||
pl.col("Longitude").alias("lng"),
|
pl.col("Longitude").alias("lng"),
|
||||||
pl.col("NptgLocalityCode").alias("locality"),
|
pl.col("NptgLocalityCode").alias("locality"),
|
||||||
|
pl.col("StopType").is_in(list(ENTRANCE_STOP_TYPES)).alias("entrance"),
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,4 +1,5 @@
|
||||||
import argparse
|
import argparse
|
||||||
|
import io
|
||||||
import tempfile
|
import tempfile
|
||||||
import polars as pl
|
import polars as pl
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
@ -14,10 +15,12 @@ URL = "https://assets.publishing.service.gov.uk/media/69c5269b4a06660f0854427b/M
|
||||||
def convert_to_parquet(csv_path: Path, parquet_path: Path) -> None:
|
def convert_to_parquet(csv_path: Path, parquet_path: Path) -> None:
|
||||||
print("Reading CSV...")
|
print("Reading CSV...")
|
||||||
|
|
||||||
|
# The gov.uk source is cp1252-encoded; decode explicitly so non-ASCII
|
||||||
|
# school names are not corrupted (see gias.py for the same approach).
|
||||||
|
text = csv_path.read_bytes().decode("cp1252")
|
||||||
df = pl.read_csv(
|
df = pl.read_csv(
|
||||||
csv_path,
|
io.StringIO(text),
|
||||||
infer_schema_length=10000,
|
infer_schema_length=10000,
|
||||||
encoding="utf8-lossy",
|
|
||||||
null_values=["NULL", "Not applicable"],
|
null_values=["NULL", "Not applicable"],
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -42,6 +42,41 @@ SEARCH_PLACE_TYPES = {
|
||||||
"island",
|
"island",
|
||||||
}
|
}
|
||||||
TRAVEL_DESTINATION_PLACE_TYPES = {"city"}
|
TRAVEL_DESTINATION_PLACE_TYPES = {"city"}
|
||||||
|
|
||||||
|
# Named OSM highways worth surfacing as searchable streets (N). Service roads, footways,
|
||||||
|
# cycleways and motorways are deliberately excluded.
|
||||||
|
SEARCHABLE_HIGHWAY_TYPES = {
|
||||||
|
"residential",
|
||||||
|
"unclassified",
|
||||||
|
"tertiary",
|
||||||
|
"tertiary_link",
|
||||||
|
"secondary",
|
||||||
|
"secondary_link",
|
||||||
|
"primary",
|
||||||
|
"primary_link",
|
||||||
|
"trunk",
|
||||||
|
"living_street",
|
||||||
|
"pedestrian",
|
||||||
|
}
|
||||||
|
|
||||||
|
# High-value named POIs (M) lifted from uk_pois.parquet into the gazetteer, mapped from the
|
||||||
|
# OSM "key/value" category onto a search place_type. Everyday shops/amenities are excluded.
|
||||||
|
HIGH_VALUE_POI_CATEGORIES = {
|
||||||
|
"leisure/park": "park",
|
||||||
|
"leisure/garden": "park",
|
||||||
|
"leisure/nature_reserve": "park",
|
||||||
|
"leisure/common": "park",
|
||||||
|
"tourism/attraction": "attraction",
|
||||||
|
"tourism/theme_park": "attraction",
|
||||||
|
"tourism/zoo": "attraction",
|
||||||
|
"tourism/museum": "attraction",
|
||||||
|
"tourism/gallery": "attraction",
|
||||||
|
"amenity/hospital": "hospital",
|
||||||
|
"healthcare/hospital": "hospital",
|
||||||
|
"shop/mall": "retail",
|
||||||
|
"shop/department_store": "retail",
|
||||||
|
}
|
||||||
|
|
||||||
ENGLAND_COUNTRY_CODE = "E92000001"
|
ENGLAND_COUNTRY_CODE = "E92000001"
|
||||||
LONDON_REGION_CODE = "E12000007"
|
LONDON_REGION_CODE = "E12000007"
|
||||||
LONDON_LAD_PREFIX = "E09"
|
LONDON_LAD_PREFIX = "E09"
|
||||||
|
|
@ -240,6 +275,139 @@ def _slugify_name(name: str) -> str:
|
||||||
return re.sub(r"\s+", "-", slug).strip("-")
|
return re.sub(r"\s+", "-", slug).strip("-")
|
||||||
|
|
||||||
|
|
||||||
|
def _street_centroid(coords: list[tuple[float, float]]) -> tuple[float, float] | None:
|
||||||
|
"""Average (lat, lon) of a way's vertices."""
|
||||||
|
if not coords:
|
||||||
|
return None
|
||||||
|
count = len(coords)
|
||||||
|
lat = sum(lat for lat, _ in coords) / count
|
||||||
|
lon = sum(lon for _, lon in coords) / count
|
||||||
|
return lat, lon
|
||||||
|
|
||||||
|
|
||||||
|
def _normalize_street_name(name: str) -> str:
|
||||||
|
"""Grouping key for a street name: collapse whitespace, lowercase."""
|
||||||
|
return re.sub(r"\s+", " ", name).strip().lower()
|
||||||
|
|
||||||
|
|
||||||
|
def _outcode_of_postcode(postcode: str) -> str:
|
||||||
|
"""Outward code (everything before the space) of a postcode, e.g. 'NW1' from 'NW1 6XE'."""
|
||||||
|
return postcode.split(" ", 1)[0] if postcode else ""
|
||||||
|
|
||||||
|
|
||||||
|
def _outcode_tree(postcodes_path: Path) -> tuple[cKDTree, list[str]]:
    """Build a nearest-neighbour index from postcode coordinates to outcodes.

    Reads the postcode parquet, keeps rows whose country code is England's,
    whose ``doterm`` is null and whose coordinates are present, and returns a
    KD-tree over their (lat, long) pairs together with a parallel list holding
    each row's outcode. Streets are tagged with the outcode of their nearest
    postcode, which is what disambiguates same-named roads.

    NOTE(review): the tree is built on raw degree values, so "nearest" is
    Euclidean in degree space rather than ground distance - confirm this is
    acceptable for outcode assignment.
    """
    in_england = pl.col("ctry25cd") == ENGLAND_COUNTRY_CODE
    not_terminated = pl.col("doterm").is_null()
    has_coordinates = pl.col("lat").is_not_null() & pl.col("long").is_not_null()

    postcodes = pl.read_parquet(
        postcodes_path, columns=["pcds", "lat", "long", "ctry25cd", "doterm"]
    ).filter(in_england & not_terminated & has_coordinates)

    latitudes = postcodes["lat"].to_numpy().astype(np.float64)
    longitudes = postcodes["long"].to_numpy().astype(np.float64)
    points = np.column_stack([latitudes, longitudes])

    # Same row order as `points`, so a tree index maps straight to an outcode.
    outcodes = list(map(_outcode_of_postcode, postcodes["pcds"].to_list()))
    return cKDTree(points), outcodes
|
||||||
|
|
||||||
|
|
||||||
|
def _build_street_places(
    streets: list[dict],
    tree: cKDTree,
    outcodes: list[str],
) -> list[dict]:
    """Collapse raw street segments into one gazetteer row per (name, outcode).

    Each segment is assigned the outcode of its nearest postcode via ``tree``.
    Segments sharing a normalized name and outcode are merged, and the merged
    row's position is the mean of the segment centroids. A road that OSM splits
    into many segments therefore yields one result per outcode it passes
    through. The display name is the name of the first segment seen for the
    group. Rows are returned sorted by lowercased name.
    """
    if not streets:
        return []

    centroids = np.array(
        [[segment["lat"], segment["lon"]] for segment in streets], dtype=np.float64
    )
    # query() returns (distances, indices); only the indices are needed.
    nearest_postcode = tree.query(centroids)[1]

    buckets: dict[tuple[str, str], dict] = {}
    for segment, postcode_idx in zip(streets, nearest_postcode):
        bucket_key = (_normalize_street_name(segment["name"]), outcodes[postcode_idx])
        bucket = buckets.get(bucket_key)
        if bucket is not None:
            bucket["lat_sum"] += segment["lat"]
            bucket["lon_sum"] += segment["lon"]
            bucket["count"] += 1
            continue
        buckets[bucket_key] = {
            "name": segment["name"],
            "lat_sum": segment["lat"],
            "lon_sum": segment["lon"],
            "count": 1,
        }

    places = [
        {
            "name": bucket["name"],
            "place_type": "street",
            "lat": bucket["lat_sum"] / bucket["count"],
            "lon": bucket["lon_sum"] / bucket["count"],
            "population": 0,
            "travel_destination": False,
            "display_city": None,
        }
        for bucket in buckets.values()
    ]
    places.sort(key=lambda place: place["name"].lower())
    return places
|
||||||
|
|
||||||
|
|
||||||
|
def _pois_to_places(pois: pl.DataFrame) -> list[dict]:
    """Convert high-value named POIs into gazetteer place rows.

    A row is kept when its category maps to a search place_type through
    HIGH_VALUE_POI_CATEGORIES and its stripped name has at least 3 characters.
    Only the first row for each (lowercased name, place_type) pair is kept, and
    output order follows first appearance in ``pois``.

    NOTE(review): de-duplication ignores location, so same-named POIs in
    different towns collapse to the first one seen - confirm this is intended.
    """
    if pois.is_empty():
        return []

    # Insertion-ordered dict doubles as the "seen" set and the output list.
    kept: dict[tuple[str, str], dict] = {}
    for record in pois.iter_rows(named=True):
        mapped_type = HIGH_VALUE_POI_CATEGORIES.get(str(record.get("category", "")))
        if mapped_type is None:
            continue
        label = str(record.get("name") or "").strip()
        dedup_key = (label.lower(), mapped_type)
        if len(label) < 3 or dedup_key in kept:
            continue
        kept[dedup_key] = {
            "name": label,
            "place_type": mapped_type,
            "lat": float(record["lat"]),
            "lon": float(record["lng"]),
            "population": 0,
            "travel_destination": False,
            "display_city": None,
        }
    return list(kept.values())
|
||||||
|
|
||||||
|
|
||||||
|
def _append_high_value_pois(places: list[dict], pois_path: Path) -> int:
    """Append high-value POIs from a parquet file to ``places`` in place.

    A POI is skipped when ``places`` already holds an entry with the same
    (lowercased name, place_type). Returns the number of rows appended.
    """
    candidates = _pois_to_places(
        pl.read_parquet(pois_path, columns=["name", "category", "lat", "lng"])
    )
    taken = {(str(place["name"]).lower(), place["place_type"]) for place in places}

    size_before = len(places)
    for candidate in candidates:
        candidate_key = (candidate["name"].lower(), candidate["place_type"])
        if candidate_key not in taken:
            taken.add(candidate_key)
            places.append(candidate)
    return len(places) - size_before
|
||||||
|
|
||||||
|
|
||||||
def _postcode_lookup(postcodes_path: Path) -> dict[str, tuple[float, float]]:
|
def _postcode_lookup(postcodes_path: Path) -> dict[str, tuple[float, float]]:
|
||||||
df = pl.read_parquet(
|
df = pl.read_parquet(
|
||||||
postcodes_path,
|
postcodes_path,
|
||||||
|
|
@ -266,6 +434,19 @@ def _display_city_from_tags(tags: dict[str, str]) -> str | None:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_population(pop_str: str) -> int:
|
||||||
|
"""Robustly parse OSM population tags that may carry grouping separators,
|
||||||
|
decimals, or surrounding text ("12,345", "5 000", "12345.0", "approx 5000").
|
||||||
|
"""
|
||||||
|
# Take the integer part before any decimal point, then the first run of
|
||||||
|
# digits ignoring grouping separators (commas/spaces) and other annotations.
|
||||||
|
match = re.search(r"\d[\d,\s]*", pop_str.split(".", 1)[0])
|
||||||
|
if match is None:
|
||||||
|
return 0
|
||||||
|
digits = re.sub(r"\D", "", match.group(0))
|
||||||
|
return int(digits) if digits else 0
|
||||||
|
|
||||||
|
|
||||||
def _is_london_admin_expr() -> pl.Expr:
|
def _is_london_admin_expr() -> pl.Expr:
|
||||||
return (
|
return (
|
||||||
(pl.col("rgn25cd") == LONDON_REGION_CODE)
|
(pl.col("rgn25cd") == LONDON_REGION_CODE)
|
||||||
|
|
@ -466,11 +647,15 @@ def _append_naptan_dlr_stations(places: list[dict], naptan_path: Path) -> int:
|
||||||
|
|
||||||
|
|
||||||
class PlaceHandler(osmium.SimpleHandler):
|
class PlaceHandler(osmium.SimpleHandler):
|
||||||
def __init__(self, progress: tqdm, england_polygon) -> None:
|
def __init__(
|
||||||
|
self, progress: tqdm, england_polygon, *, collect_streets: bool = False
|
||||||
|
) -> None:
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self._progress = progress
|
self._progress = progress
|
||||||
self.places: list[dict] = []
|
self.places: list[dict] = []
|
||||||
|
self.streets: list[dict] = []
|
||||||
self._england = england_polygon
|
self._england = england_polygon
|
||||||
|
self._collect_streets = collect_streets
|
||||||
|
|
||||||
def _add(
|
def _add(
|
||||||
self,
|
self,
|
||||||
|
|
@ -513,11 +698,7 @@ class PlaceHandler(osmium.SimpleHandler):
|
||||||
if not name:
|
if not name:
|
||||||
return
|
return
|
||||||
|
|
||||||
pop_str = tags.get("population", "")
|
population = _parse_population(tags.get("population", ""))
|
||||||
try:
|
|
||||||
population = int(pop_str)
|
|
||||||
except ValueError:
|
|
||||||
population = 0
|
|
||||||
|
|
||||||
# place=* nodes
|
# place=* nodes
|
||||||
place_type = tags.get("place")
|
place_type = tags.get("place")
|
||||||
|
|
@ -551,6 +732,39 @@ class PlaceHandler(osmium.SimpleHandler):
|
||||||
)
|
)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
def way(self, w: osmium.osm.Way) -> None:
    """Record one raw street segment for each named, searchable highway way.

    Does nothing unless street collection was enabled on this handler. An
    accepted way is reduced to the mean of its resolvable node coordinates and
    appended to ``self.streets``; segments are grouped into streets later.
    The English name (``name:en``) is preferred, falling back to ``name``.
    """
    if not self._collect_streets:
        return
    self._progress.update(1)

    tags = w.tags
    if tags.get("highway") not in SEARCHABLE_HIGHWAY_TYPES:
        return
    street_name = tags.get("name:en", tags.get("name", ""))
    if not street_name:
        return

    # Way node refs expose resolved .lat/.lon directly (locations=True); reading
    # them raises InvalidLocationError when a node's location is missing from
    # the index, so those vertices are left out of the average.
    vertices: list[tuple[float, float]] = []
    for ref in w.nodes:
        try:
            vertices.append((ref.lat, ref.lon))
        except osmium.InvalidLocationError:
            pass

    midpoint = _street_centroid(vertices)
    if midpoint is None:
        return
    lat, lon = midpoint

    # Bounding-box test first, then the polygon containment test.
    in_bbox = (
        ENGLAND_BBOX_SOUTH <= lat <= ENGLAND_BBOX_NORTH
        and ENGLAND_BBOX_WEST <= lon <= ENGLAND_BBOX_EAST
    )
    if in_bbox and self._england.contains(Point(lon, lat)):
        self.streets.append({"name": street_name, "lat": lat, "lon": lon})
|
||||||
|
|
||||||
|
|
||||||
def main() -> None:
|
def main() -> None:
|
||||||
parser = argparse.ArgumentParser(description="Extract place names from OSM PBF")
|
parser = argparse.ArgumentParser(description="Extract place names from OSM PBF")
|
||||||
|
|
@ -578,15 +792,28 @@ def main() -> None:
|
||||||
"--postcodes",
|
"--postcodes",
|
||||||
type=Path,
|
type=Path,
|
||||||
help=(
|
help=(
|
||||||
"Postcode parquet used to geocode OfS university contact postcodes "
|
"Postcode parquet used to geocode OfS university contact postcodes, assign "
|
||||||
"and assign Greater London display labels"
|
"Greater London display labels, and tag streets with their outcode"
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--pois",
|
||||||
|
type=Path,
|
||||||
|
help="Optional uk_pois.parquet; high-value named POIs are added to the gazetteer",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--include-streets",
|
||||||
|
action="store_true",
|
||||||
|
help="Extract named highways as searchable streets (requires --postcodes)",
|
||||||
|
)
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
pbf_file = args.pbf
|
pbf_file = args.pbf
|
||||||
england_polygon = load_england_polygon(args.boundary)
|
england_polygon = load_england_polygon(args.boundary)
|
||||||
|
|
||||||
|
if args.include_streets and not args.postcodes:
|
||||||
|
raise ValueError("--postcodes is required with --include-streets")
|
||||||
|
|
||||||
print("Extracting search place nodes + railway stations")
|
print("Extracting search place nodes + railway stations")
|
||||||
with tqdm(
|
with tqdm(
|
||||||
unit=" elements",
|
unit=" elements",
|
||||||
|
|
@ -595,10 +822,21 @@ def main() -> None:
|
||||||
smoothing=0.05,
|
smoothing=0.05,
|
||||||
mininterval=1.0,
|
mininterval=1.0,
|
||||||
) as progress:
|
) as progress:
|
||||||
handler = PlaceHandler(progress, england_polygon)
|
handler = PlaceHandler(
|
||||||
|
progress, england_polygon, collect_streets=args.include_streets
|
||||||
|
)
|
||||||
handler.apply_file(str(pbf_file), locations=True)
|
handler.apply_file(str(pbf_file), locations=True)
|
||||||
|
|
||||||
print(f"Extracted {len(handler.places):,} place nodes")
|
print(f"Extracted {len(handler.places):,} place nodes")
|
||||||
|
if args.include_streets:
|
||||||
|
print(f"Collected {len(handler.streets):,} named street segments")
|
||||||
|
tree, outcodes = _outcode_tree(args.postcodes)
|
||||||
|
street_places = _build_street_places(handler.streets, tree, outcodes)
|
||||||
|
handler.places.extend(street_places)
|
||||||
|
print(f"Added {len(street_places):,} grouped streets")
|
||||||
|
if args.pois:
|
||||||
|
added = _append_high_value_pois(handler.places, args.pois)
|
||||||
|
print(f"Added {added:,} high-value POIs from {args.pois}")
|
||||||
if args.naptan:
|
if args.naptan:
|
||||||
added = _append_naptan_dlr_stations(handler.places, args.naptan)
|
added = _append_naptan_dlr_stations(handler.places, args.naptan)
|
||||||
print(f"Added {added:,} DLR station destinations from NaPTAN")
|
print(f"Added {added:,} DLR station destinations from NaPTAN")
|
||||||
|
|
|
||||||
|
|
@ -89,3 +89,92 @@ def test_deduplicate_naptan_does_not_merge_missing_locality_bus_stops():
|
||||||
result = deduplicate_naptan(df)
|
result = deduplicate_naptan(df)
|
||||||
|
|
||||||
assert len(result) == 2
|
assert len(result) == 2
|
||||||
|
|
||||||
|
|
||||||
|
def test_deduplicate_naptan_merges_colocated_missing_locality_bus_stations():
    """Two same-named bus-station records with no locality, a few metres apart,
    are a true duplicate and must collapse to one POI at their mean position."""
    lats = [51.4952, 51.4953]
    duplicate_records = pl.DataFrame(
        {
            "id": ["a", "b"],
            "name": ["Victoria Bus Station"] * 2,
            "category": ["Bus station"] * 2,
            "lat": lats,
            "lng": [-0.1441, -0.1440],
            "locality": [None] * 2,
        }
    )

    merged = deduplicate_naptan(duplicate_records)

    assert merged.height == 1
    only = merged.row(0, named=True)
    assert only["name"] == "Victoria Bus Station"
    assert only["category"] == "Bus station"
    assert only["lat"] == pytest.approx((51.4952 + 51.4953) / 2)
|
||||||
|
|
||||||
|
|
||||||
|
def test_deduplicate_naptan_keeps_rail_station_with_only_station_node():
    """A rail station whose only record is its station node (StopType "RLY",
    i.e. not an entrance) must survive deduplication unchanged."""
    lone_station = pl.DataFrame(
        {
            "id": ["aberdare-rly"],
            "name": ["Aberdare Rail Station"],
            "category": ["Rail station"],
            "lat": [51.7155],
            "lng": [-3.4438],
            "locality": ["ABERDARE"],
            "entrance": [False],
        }
    )

    deduped = deduplicate_naptan(lone_station)

    assert deduped.height == 1
    survivor = deduped.row(0, named=True)
    assert survivor["name"] == "Aberdare Rail Station"
    assert survivor["category"] == "Rail station"
|
||||||
|
|
||||||
|
|
||||||
|
def test_deduplicate_naptan_merges_rail_entrances_into_station_node():
    """A station node and its entrances collapse to the station node's POI.

    One RLY station node plus two RSE entrance nodes must yield a single
    "Rail station" row, and that row must be the station node rather than
    one of the entrances.
    """
    record_count = 3
    stops = pl.DataFrame(
        {
            "id": ["clapham-rly", "clapham-rse-a", "clapham-rse-b"],
            "name": ["Clapham Junction Rail Station"] * record_count,
            "category": ["Rail station"] * record_count,
            "lat": [51.4642, 51.4644, 51.4640],
            "lng": [-0.1705, -0.1702, -0.1708],
            "locality": ["CLAPHAM"] * record_count,
            "entrance": [False, True, True],
        }
    )

    deduped = deduplicate_naptan(stops)

    assert len(deduped) == 1
    assert deduped["id"][0] == "clapham-rly"
    assert deduped["category"][0] == "Rail station"
|
||||||
|
|
||||||
|
|
||||||
|
def test_deduplicate_naptan_does_not_merge_rail_and_ferry_in_same_area():
    """Different transport modes sharing a name/area stay as separate POIs."""
    shared_name = "Harbour Station"
    shared_locality = "HARBOUR"
    stops = pl.DataFrame(
        {
            "id": ["harbour-rail", "harbour-ferry"],
            "name": [shared_name, shared_name],
            "category": ["Rail station", "Ferry"],
            "lat": [51.5, 51.5001],
            "lng": [-0.1, -0.1001],
            "locality": [shared_locality, shared_locality],
            "entrance": [False, False],
        }
    )

    deduped = deduplicate_naptan(stops)
    # Sort so the assertion does not depend on the output row order.
    by_category = deduped.sort("category")

    assert len(by_category) == 2
    assert by_category["category"].to_list() == ["Ferry", "Rail station"]
|
||||||
|
|
|
||||||
|
|
@ -2,14 +2,32 @@ import httpx
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pytest
|
import pytest
|
||||||
import rasterio
|
import rasterio
|
||||||
|
from rasterio.io import MemoryFile
|
||||||
from rasterio.transform import from_origin
|
from rasterio.transform import from_origin
|
||||||
|
|
||||||
from pipeline.download import noise
|
from pipeline.download import noise
|
||||||
|
|
||||||
|
|
||||||
|
def _tiny_geotiff_bytes() -> bytes:
    """Return the bytes of a 1x1, single-band uint8 GeoTIFF built in memory.

    The raster holds the single value 55 and is written through rasterio's
    ``MemoryFile`` with CRS EPSG:27700, so no file is created on disk.
    """
    pixels = np.array([[55]], dtype=np.uint8)
    n_rows, n_cols = pixels.shape
    profile = {
        "driver": "GTiff",
        "height": n_rows,
        "width": n_cols,
        "count": 1,
        "dtype": pixels.dtype,
        "crs": "EPSG:27700",
        "transform": from_origin(0, 1, 1, 1),
    }
    with MemoryFile() as memfile:
        with memfile.open(**profile) as dataset:
            dataset.write(pixels, 1)
        # Read the buffer only after the dataset context has closed.
        return bytes(memfile.getbuffer())
|
||||||
|
|
||||||
|
|
||||||
def test_download_tile_splits_after_retries(monkeypatch, tmp_path):
|
def test_download_tile_splits_after_retries(monkeypatch, tmp_path):
|
||||||
monkeypatch.setattr(noise, "MAX_RETRIES", 1)
|
monkeypatch.setattr(noise, "MAX_RETRIES", 1)
|
||||||
monkeypatch.setattr(noise, "MIN_TILE_SIZE", 50)
|
monkeypatch.setattr(noise, "MIN_TILE_SIZE", 50)
|
||||||
|
tile_bytes = _tiny_geotiff_bytes()
|
||||||
|
|
||||||
def fake_fetch_tile_bytes(
|
def fake_fetch_tile_bytes(
|
||||||
wcs_base,
|
wcs_base,
|
||||||
|
|
@ -22,7 +40,7 @@ def test_download_tile_splits_after_retries(monkeypatch, tmp_path):
|
||||||
):
|
):
|
||||||
if max_e - min_e > 50 or max_n - min_n > 50:
|
if max_e - min_e > 50 or max_n - min_n > 50:
|
||||||
raise httpx.TimeoutException("too large")
|
raise httpx.TimeoutException("too large")
|
||||||
return b"II*\x00fake-tiff"
|
return tile_bytes
|
||||||
|
|
||||||
monkeypatch.setattr(noise, "_fetch_tile_bytes", fake_fetch_tile_bytes)
|
monkeypatch.setattr(noise, "_fetch_tile_bytes", fake_fetch_tile_bytes)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,15 +1,22 @@
|
||||||
|
import numpy as np
|
||||||
import polars as pl
|
import polars as pl
|
||||||
from pyproj import Transformer
|
from pyproj import Transformer
|
||||||
|
from scipy.spatial import cKDTree
|
||||||
|
|
||||||
from pipeline.download.places import (
|
from pipeline.download.places import (
|
||||||
_assign_london_display_city,
|
_assign_london_display_city,
|
||||||
|
_build_street_places,
|
||||||
_display_city_from_tags,
|
_display_city_from_tags,
|
||||||
_is_dlr_station,
|
_is_dlr_station,
|
||||||
_is_tram_station,
|
_is_tram_station,
|
||||||
_naptan_dlr_stations,
|
_naptan_dlr_stations,
|
||||||
|
_normalize_street_name,
|
||||||
_ofs_universities,
|
_ofs_universities,
|
||||||
|
_outcode_of_postcode,
|
||||||
|
_pois_to_places,
|
||||||
_select_university_name,
|
_select_university_name,
|
||||||
_station_display_name,
|
_station_display_name,
|
||||||
|
_street_centroid,
|
||||||
)
|
)
|
||||||
|
|
||||||
WGS84_TO_BNG = Transformer.from_crs("EPSG:4326", "EPSG:27700", always_xy=True)
|
WGS84_TO_BNG = Transformer.from_crs("EPSG:4326", "EPSG:27700", always_xy=True)
|
||||||
|
|
@ -168,6 +175,73 @@ def test_ofs_universities_extracts_university_title_rows_with_postcode_coords():
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def test_street_centroid_averages_vertices():
    """The centroid is the per-axis mean of the vertices; no vertices gives None."""
    vertices = [(51.0, -0.1), (53.0, -0.3)]
    expected_centroid = (52.0, -0.2)

    assert _street_centroid(vertices) == expected_centroid
    assert _street_centroid([]) is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_normalize_street_name_and_outcode():
    """Street names are whitespace-collapsed and lower-cased; outcodes are the leading postcode part."""
    messy_name = "  High   Street "
    assert _normalize_street_name(messy_name) == "high street"

    # A full postcode yields its outward code; an empty postcode yields "".
    assert _outcode_of_postcode("NW1 6XE") == "NW1"
    assert _outcode_of_postcode("") == ""
|
||||||
|
|
||||||
|
|
||||||
|
def test_build_street_places_groups_segments_by_name_and_outcode():
    """Segments merge per (street name, outcode); same-named roads elsewhere stay apart."""
    # Two postcodes: NW1 (north) and CR0 (south).
    postcode_coords = np.array([[51.53, -0.14], [51.37, -0.10]], dtype=np.float64)
    tree = cKDTree(postcode_coords)
    outcodes = ["NW1", "CR0"]

    nw1_first = {"name": "High Street", "lat": 51.531, "lon": -0.141}  # NW1
    nw1_second = {"name": "High Street", "lat": 51.529, "lon": -0.139}  # NW1 (same road, 2nd segment)
    cr0_high = {"name": "High Street", "lat": 51.371, "lon": -0.101}  # CR0 (different road, same name)
    nw1_baker = {"name": "Baker Street", "lat": 51.5305, "lon": -0.1405}  # NW1
    streets = [nw1_first, nw1_second, cr0_high, nw1_baker]

    places = _build_street_places(streets, tree, outcodes)

    # 3 distinct streets: High Street/NW1 (2 segments merged), High Street/CR0, Baker Street/NW1.
    assert len(places) == 3
    for place in places:
        assert place["place_type"] == "street"

    northern_high_streets = (
        place
        for place in places
        if place["name"] == "High Street" and place["lat"] > 51.5
    )
    nw1_high = next(northern_high_streets)
    assert nw1_high["lat"] == (51.531 + 51.529) / 2
    assert nw1_high["lon"] == (-0.141 + -0.139) / 2

    # The same-named CR0 road stays separate.
    southern_high_streets = [
        place
        for place in places
        if place["name"] == "High Street" and place["lat"] < 51.4
    ]
    assert len(southern_high_streets) > 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_pois_to_places_keeps_high_value_named_pois_only():
    """Only named, high-value POIs become places, each emitted once.

    The fixture mixes a park, a hospital, an everyday amenity, an exact
    duplicate of the park and an unnamed park; only the first two should
    survive, and neither should be flagged as a travel destination.
    """
    names = ["Hyde Park", "St Thomas' Hospital", "Joe's Cafe", "Hyde Park", ""]
    categories = [
        "leisure/park",
        "amenity/hospital",
        "amenity/cafe",  # everyday amenity → excluded
        "leisure/park",  # duplicate → excluded
        "leisure/park",  # empty name → excluded
    ]
    pois = pl.DataFrame(
        {
            "name": names,
            "category": categories,
            "lat": [51.507, 51.498, 51.5, 51.507, 51.6],
            "lng": [-0.165, -0.118, -0.1, -0.165, -0.2],
        }
    )

    places = _pois_to_places(pois)

    kept = [(place["name"], place["place_type"]) for place in places]
    assert kept == [
        ("Hyde Park", "park"),
        ("St Thomas' Hospital", "hospital"),
    ]
    for place in places:
        assert place["travel_destination"] is False
|
||||||
|
|
||||||
|
|
||||||
def test_display_city_from_tags_uses_explicit_london_context():
|
def test_display_city_from_tags_uses_explicit_london_context():
|
||||||
assert _display_city_from_tags({"is_in": "Croydon, London, UK"}) == "London"
|
assert _display_city_from_tags({"is_in": "Croydon, London, UK"}) == "London"
|
||||||
assert _display_city_from_tags({"is_in": "Croydon, Cambridgeshire, UK"}) is None
|
assert _display_city_from_tags({"is_in": "Croydon, Cambridgeshire, UK"}) is None
|
||||||
|
|
|
||||||
|
|
@ -307,9 +307,14 @@ def convert_high_freq_to_frequency_based(
|
||||||
|
|
||||||
print(f" Found {len(trip_group_key)} trips on target routes")
|
print(f" Found {len(trip_group_key)} trips on target routes")
|
||||||
|
|
||||||
# Step 3: Get first departure time and first stop for each target trip
|
# Step 3: Get first departure time and first stop for each target trip.
|
||||||
|
# GTFS only requires stop_sequence to be strictly increasing per trip; it
|
||||||
|
# is NOT required to start at 0 (1-based numbering is common, and BODS is
|
||||||
|
# consumed raw here without renumbering). So pick the row with the minimum
|
||||||
|
# stop_sequence per trip rather than keying off the literal "0".
|
||||||
trip_first_dep: dict[str, int] = {}
|
trip_first_dep: dict[str, int] = {}
|
||||||
trip_first_stop: dict[str, str] = {}
|
trip_first_stop: dict[str, str] = {}
|
||||||
|
trip_min_seq: dict[str, int] = {}
|
||||||
with zin.open("stop_times.txt") as f:
|
with zin.open("stop_times.txt") as f:
|
||||||
cols = _parse_csv_line(f.readline())
|
cols = _parse_csv_line(f.readline())
|
||||||
trip_id_idx = cols.index("trip_id")
|
trip_id_idx = cols.index("trip_id")
|
||||||
|
|
@ -323,12 +328,26 @@ def convert_high_freq_to_frequency_based(
|
||||||
trip_id = parts[trip_id_idx].strip('"')
|
trip_id = parts[trip_id_idx].strip('"')
|
||||||
if trip_id not in trip_group_key:
|
if trip_id not in trip_group_key:
|
||||||
continue
|
continue
|
||||||
if parts[seq_idx].strip('"') == "0":
|
try:
|
||||||
|
seq = int(parts[seq_idx].strip('"'))
|
||||||
|
except ValueError:
|
||||||
|
continue
|
||||||
|
if trip_id in trip_min_seq and seq >= trip_min_seq[trip_id]:
|
||||||
|
continue
|
||||||
dep_secs = _parse_gtfs_time(parts[dep_idx])
|
dep_secs = _parse_gtfs_time(parts[dep_idx])
|
||||||
if dep_secs is not None:
|
if dep_secs is None:
|
||||||
|
continue
|
||||||
|
trip_min_seq[trip_id] = seq
|
||||||
trip_first_dep[trip_id] = dep_secs
|
trip_first_dep[trip_id] = dep_secs
|
||||||
trip_first_stop[trip_id] = parts[stop_id_idx].strip('"')
|
trip_first_stop[trip_id] = parts[stop_id_idx].strip('"')
|
||||||
|
|
||||||
|
if trip_group_key and not trip_first_dep:
|
||||||
|
raise RuntimeError(
|
||||||
|
"convert_high_freq_to_frequency_based found no first stops for "
|
||||||
|
f"{len(trip_group_key)} target trips; stop_times.txt may be malformed "
|
||||||
|
"or stop_sequence parsing failed"
|
||||||
|
)
|
||||||
|
|
||||||
# Step 4: Group trips by (route, direction, service, first_stop) and compute headways
|
# Step 4: Group trips by (route, direction, service, first_stop) and compute headways
|
||||||
groups: dict[tuple[str, ...], list[tuple[str, int]]] = defaultdict(list)
|
groups: dict[tuple[str, ...], list[tuple[str, int]]] = defaultdict(list)
|
||||||
for trip_id, dep_secs in trip_first_dep.items():
|
for trip_id, dep_secs in trip_first_dep.items():
|
||||||
|
|
|
||||||
|
|
@ -95,11 +95,14 @@ def transform_crime(
|
||||||
f"({valid_months[0]} to {valid_months[-1]})"
|
f"({valid_months[0]} to {valid_months[-1]})"
|
||||||
)
|
)
|
||||||
|
|
||||||
# Count monthly incidents, then annualise over every valid month in the dataset.
|
# Annualise each year separately (count_in_year * 12 / months_in_year), then
|
||||||
# `_weight` (≤1) comes from the LSOA 2011→2021 lookup: 2011 LSOAs that split
|
# take the simple mean of those per-year rates over the years each type is
|
||||||
# into N 2021 LSOAs contribute 1/N of their count to each child, since we
|
# present. This makes the headline equal the average of the by-year chart bars
|
||||||
# don't know which child a given incident actually belonged to.
|
# (_write_crime_by_year) instead of a month-weighted pooled rate, mirroring
|
||||||
yearly_counts = (
|
# crime_spatial._write_avg_yr. `_weight` (≤1) comes from the LSOA 2011→2021
|
||||||
|
# lookup: 2011 LSOAs that split into N 2021 LSOAs contribute 1/N of their count
|
||||||
|
# to each child, since we don't know which child an incident actually belonged to.
|
||||||
|
filtered = (
|
||||||
df.filter(
|
df.filter(
|
||||||
valid_month_expr
|
valid_month_expr
|
||||||
& pl.col("LSOA code").is_not_null()
|
& pl.col("LSOA code").is_not_null()
|
||||||
|
|
@ -107,15 +110,31 @@ def transform_crime(
|
||||||
& pl.col("Crime type").is_not_null()
|
& pl.col("Crime type").is_not_null()
|
||||||
& (pl.col("Crime type") != "")
|
& (pl.col("Crime type") != "")
|
||||||
)
|
)
|
||||||
.with_columns(pl.col("Crime type").replace(LEGACY_CRIME_TYPE_ALIASES))
|
.with_columns(
|
||||||
.group_by("LSOA code", "Month", "Crime type")
|
pl.col("Month").str.slice(0, 4).cast(pl.Int32).alias("year"),
|
||||||
.agg((pl.col("_weight").first() * pl.len()).alias("count"))
|
pl.col("Crime type").replace(LEGACY_CRIME_TYPE_ALIASES),
|
||||||
.group_by("LSOA code", "Crime type")
|
|
||||||
.agg(
|
|
||||||
(pl.col("count").sum() / pl.lit(valid_month_count) * 12)
|
|
||||||
.round(1)
|
|
||||||
.alias("yearly_avg")
|
|
||||||
)
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
# Months observed *anywhere* in the dataset for each year (annualisation
|
||||||
|
# denominator), matching the by-year output's per-year scaling.
|
||||||
|
months_per_year = filtered.group_by("year").agg(
|
||||||
|
pl.col("Month").n_unique().alias("months_in_year")
|
||||||
|
)
|
||||||
|
|
||||||
|
yearly_counts = (
|
||||||
|
filtered.group_by("LSOA code", "year", "Crime type", "Month")
|
||||||
|
.agg((pl.col("_weight").first() * pl.len()).alias("count"))
|
||||||
|
.group_by("LSOA code", "year", "Crime type")
|
||||||
|
.agg(pl.col("count").sum().alias("count"))
|
||||||
|
.join(months_per_year, on="year")
|
||||||
|
.with_columns(
|
||||||
|
(pl.col("count") * 12.0 / pl.col("months_in_year")).alias("per_year")
|
||||||
|
)
|
||||||
|
# Mean of the per-year annualised rates over the years the type is present
|
||||||
|
# (only years with rows are grouped here, so this is the correct x-span).
|
||||||
|
.group_by("LSOA code", "Crime type")
|
||||||
|
.agg(pl.col("per_year").mean().round(1).alias("yearly_avg"))
|
||||||
.collect(engine="streaming")
|
.collect(engine="streaming")
|
||||||
)
|
)
|
||||||
if yearly_counts.is_empty():
|
if yearly_counts.is_empty():
|
||||||
|
|
|
||||||
|
|
@ -259,11 +259,14 @@ def _write_avg_yr(
|
||||||
"""
|
"""
|
||||||
months = np.array([months_in_year[year] for year in years], dtype=np.float64)
|
months = np.array([months_in_year[year] for year in years], dtype=np.float64)
|
||||||
per_year = counts.astype(np.float64) * 12.0 / months[None, None, :]
|
per_year = counts.astype(np.float64) * 12.0 / months[None, None, :]
|
||||||
# Average over the years each type is actually observed anywhere -- the same
|
# Average over the years *this postcode* actually has incidents of *this
|
||||||
# per-type x-span the by-year chart plots (server-rs/.../crime_by_year.rs).
|
# type* -- the same per-(postcode, type) x-span the by-year chart plots
|
||||||
type_year_present = counts.sum(axis=0) > 0 # (n_types, n_years)
|
# (server-rs/.../crime_by_year.rs), so the headline equals the mean of the
|
||||||
years_per_type = np.clip(type_year_present.sum(axis=1), 1, None).astype(np.float64)
|
# by-year bars. Dividing by a global years-present count (years a type
|
||||||
avg = per_year.sum(axis=2) / years_per_type[None, :] # (n_postcodes, n_types)
|
# appeared anywhere in England) would deflate postcodes whose incidents
|
||||||
|
# cluster in only a few years of the ~13-year window.
|
||||||
|
years_present = np.clip((counts > 0).sum(axis=2), 1, None).astype(np.float64)
|
||||||
|
avg = per_year.sum(axis=2) / years_present # (n_postcodes, n_types)
|
||||||
avg = np.round(avg * norm[:, None], 1).astype(np.float32)
|
avg = np.round(avg * norm[:, None], 1).astype(np.float32)
|
||||||
|
|
||||||
data: dict[str, np.ndarray] = {"postcode": postcodes}
|
data: dict[str, np.ndarray] = {"postcode": postcodes}
|
||||||
|
|
|
||||||
|
|
@ -365,6 +365,16 @@ def _run(epc_path: Path, price_paid_path: Path, output_path: Path, temp_dir: Pat
|
||||||
}
|
}
|
||||||
duration_map = {"F": "Freehold", "L": "Leasehold"}
|
duration_map = {"F": "Freehold", "L": "Leasehold"}
|
||||||
|
|
||||||
|
# price >= MIN_PRICE and ppd_category == "A" (standard open-market sale) are
|
||||||
|
# VALUE-QUALITY filters: they gate the price aggregations only. Category B
|
||||||
|
# entries (repossessions, bulk/portfolio, power-of-sale transfers) and sub-MIN
|
||||||
|
# sales must not pollute latest_price / historical_prices (and the downstream
|
||||||
|
# price-per-sqm feature), but they MUST still count for first_transfer_date /
|
||||||
|
# old_new so a new-build's genuine earliest transfer year is preserved.
|
||||||
|
price_ok = pl.col("price") >= MIN_PRICE
|
||||||
|
category_ok = pl.col("ppd_category") == "A"
|
||||||
|
quality_ok = price_ok & category_ok
|
||||||
|
|
||||||
price_paid = (
|
price_paid = (
|
||||||
pl.scan_parquet(price_paid_path)
|
pl.scan_parquet(price_paid_path)
|
||||||
.select(
|
.select(
|
||||||
|
|
@ -381,9 +391,9 @@ def _run(epc_path: Path, price_paid_path: Path, output_path: Path, temp_dir: Pat
|
||||||
"town_city",
|
"town_city",
|
||||||
pl.col("duration").replace(duration_map),
|
pl.col("duration").replace(duration_map),
|
||||||
"old_new",
|
"old_new",
|
||||||
|
"ppd_category",
|
||||||
)
|
)
|
||||||
.filter(pl.col("pp_property_type") != "Other")
|
.filter(pl.col("pp_property_type") != "Other")
|
||||||
.filter(pl.col("price") >= MIN_PRICE)
|
|
||||||
.with_columns(
|
.with_columns(
|
||||||
pl.concat_str(
|
pl.concat_str(
|
||||||
[pl.col("saon"), pl.col("paon"), pl.col("street")],
|
[pl.col("saon"), pl.col("paon"), pl.col("street")],
|
||||||
|
|
@ -408,18 +418,26 @@ def _run(epc_path: Path, price_paid_path: Path, output_path: Path, temp_dir: Pat
|
||||||
pl.col("postcode").last(),
|
pl.col("postcode").last(),
|
||||||
pl.col("_pp_match_address").last(),
|
pl.col("_pp_match_address").last(),
|
||||||
pl.col("_pp_match_postcode").last(),
|
pl.col("_pp_match_postcode").last(),
|
||||||
|
# Price aggregations are restricted to quality-passing sales.
|
||||||
pl.struct(
|
pl.struct(
|
||||||
pl.col("date_of_transfer").dt.year().alias("year"),
|
pl.col("date_of_transfer").dt.year().alias("year"),
|
||||||
pl.col("date_of_transfer").dt.month().cast(pl.UInt8).alias("month"),
|
pl.col("date_of_transfer").dt.month().cast(pl.UInt8).alias("month"),
|
||||||
"price",
|
"price",
|
||||||
).alias("historical_prices"),
|
)
|
||||||
|
.filter(quality_ok)
|
||||||
|
.alias("historical_prices"),
|
||||||
pl.col("pp_property_type").last(),
|
pl.col("pp_property_type").last(),
|
||||||
pl.col("duration").last(),
|
pl.col("duration").last(),
|
||||||
pl.col("price").last().alias("latest_price"),
|
pl.col("price").filter(quality_ok).last().alias("latest_price"),
|
||||||
pl.col("date_of_transfer").last(),
|
pl.col("date_of_transfer").filter(quality_ok).last(),
|
||||||
|
# first_transfer_date / old_new reflect the genuine earliest transfer
|
||||||
|
# over the full per-group transaction stream (not value-filtered).
|
||||||
pl.col("date_of_transfer").first().alias("first_transfer_date"),
|
pl.col("date_of_transfer").first().alias("first_transfer_date"),
|
||||||
pl.col("old_new").first(),
|
pl.col("old_new").first(),
|
||||||
)
|
)
|
||||||
|
# Preserve the property universe: previously a property needed >=1 sale
|
||||||
|
# >=MIN_PRICE to form a group, so drop groups with no quality-passing sale.
|
||||||
|
.filter(pl.col("latest_price").is_not_null())
|
||||||
)
|
)
|
||||||
|
|
||||||
print("Price paid dataset")
|
print("Price paid dataset")
|
||||||
|
|
|
||||||
|
|
@ -839,14 +839,13 @@ def _join_area_side_tables(
|
||||||
# Crime is counted spatially per postcode (incidents within 50m of the
|
# Crime is counted spatially per postcode (incidents within 50m of the
|
||||||
# postcode boundary), so it joins on postcode rather than LSOA.
|
# postcode boundary), so it joins on postcode rather than LSOA.
|
||||||
base = base.join(crime, on="postcode", how="left")
|
base = base.join(crime, on="postcode", how="left")
|
||||||
base = base.with_columns(
|
serious_crime_cols = [
|
||||||
pl.sum_horizontal(
|
|
||||||
"Violence and sexual offences (avg/yr)",
|
"Violence and sexual offences (avg/yr)",
|
||||||
"Robbery (avg/yr)",
|
"Robbery (avg/yr)",
|
||||||
"Burglary (avg/yr)",
|
"Burglary (avg/yr)",
|
||||||
"Possession of weapons (avg/yr)",
|
"Possession of weapons (avg/yr)",
|
||||||
).alias("serious_crime_avg_yr"),
|
]
|
||||||
pl.sum_horizontal(
|
minor_crime_cols = [
|
||||||
"Anti-social behaviour (avg/yr)",
|
"Anti-social behaviour (avg/yr)",
|
||||||
"Criminal damage and arson (avg/yr)",
|
"Criminal damage and arson (avg/yr)",
|
||||||
"Shoplifting (avg/yr)",
|
"Shoplifting (avg/yr)",
|
||||||
|
|
@ -857,7 +856,19 @@ def _join_area_side_tables(
|
||||||
"Public order (avg/yr)",
|
"Public order (avg/yr)",
|
||||||
"Drugs (avg/yr)",
|
"Drugs (avg/yr)",
|
||||||
"Other crime (avg/yr)",
|
"Other crime (avg/yr)",
|
||||||
).alias("minor_crime_avg_yr"),
|
]
|
||||||
|
# The LEFT join leaves every per-type column null for postcodes absent from
|
||||||
|
# the crime table; sum_horizontal alone would fabricate a "zero crime"
|
||||||
|
# rollup there, so keep the rollup null when ALL components are null.
|
||||||
|
base = base.with_columns(
|
||||||
|
pl.when(pl.all_horizontal([pl.col(c).is_null() for c in serious_crime_cols]))
|
||||||
|
.then(None)
|
||||||
|
.otherwise(pl.sum_horizontal(serious_crime_cols))
|
||||||
|
.alias("serious_crime_avg_yr"),
|
||||||
|
pl.when(pl.all_horizontal([pl.col(c).is_null() for c in minor_crime_cols]))
|
||||||
|
.then(None)
|
||||||
|
.otherwise(pl.sum_horizontal(minor_crime_cols))
|
||||||
|
.alias("minor_crime_avg_yr"),
|
||||||
)
|
)
|
||||||
|
|
||||||
base = base.join(median_age, on="lsoa21", how="left")
|
base = base.join(median_age, on="lsoa21", how="left")
|
||||||
|
|
@ -1179,7 +1190,22 @@ def _load_listings_for_merge(listings_path: Path, arcgis_path: Path) -> pl.DataF
|
||||||
# pages); tolerate its absence so older parquets and test fixtures still
|
# pages); tolerate its absence so older parquets and test fixtures still
|
||||||
# load. Digits-only so it compares equal to the EPC register's UPRN.
|
# load. Digits-only so it compares equal to the EPC register's UPRN.
|
||||||
if "UPRN" in raw.collect_schema().names():
|
if "UPRN" in raw.collect_schema().names():
|
||||||
uprn_digits = pl.col("UPRN").cast(pl.Utf8).str.replace_all(r"\D", "")
|
# Mirror `_normalize_uprn` exactly so the listing key compares equal to
|
||||||
|
# the candidate-side key for every dtype. For a Float UPRN we must
|
||||||
|
# stringify via its integer form (100023336956.0 -> "100023336956"),
|
||||||
|
# otherwise stripping non-digits from "100023336956.0" yields a bogus
|
||||||
|
# trailing-zero key ("1000233369560") that never collides; and a
|
||||||
|
# non-integral float (e.g. 1.5) must be rejected rather than mangled.
|
||||||
|
uprn_col = pl.col("UPRN")
|
||||||
|
if raw.collect_schema()["UPRN"].is_float():
|
||||||
|
integral = uprn_col.cast(pl.Int64, strict=False)
|
||||||
|
uprn_digits = (
|
||||||
|
pl.when(integral == uprn_col)
|
||||||
|
.then(integral.cast(pl.Utf8).str.replace_all(r"\D", ""))
|
||||||
|
.otherwise(None)
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
uprn_digits = uprn_col.cast(pl.Utf8).str.replace_all(r"\D", "")
|
||||||
listing_uprn_expr = (
|
listing_uprn_expr = (
|
||||||
pl.when(uprn_digits.str.len_chars() > 0)
|
pl.when(uprn_digits.str.len_chars() > 0)
|
||||||
.then(uprn_digits)
|
.then(uprn_digits)
|
||||||
|
|
@ -1615,9 +1641,23 @@ def _enrich_listings_with_direct_epc(
|
||||||
|
|
||||||
|
|
||||||
def _coalesce_direct_epc_columns(wide: pl.LazyFrame) -> pl.LazyFrame:
    """Fill each raw EPC column from its directly-matched counterpart.

    For every ``(raw, direct)`` pair in ``_DIRECT_EPC_RAW_COLUMN_MAP`` the raw
    column is replaced by ``coalesce(raw, direct)``, i.e. the raw value wins
    unless it is null. ``was_council_house`` is the one exception: it becomes
    "Yes" whenever either side is "Yes".

    Args:
        wide: Frame holding both the raw and the direct EPC columns.

    Returns:
        The frame with every raw column overwritten by its merged value.
    """

    def _coalesced(raw_column: str, direct_column: str) -> pl.Expr:
        raw_value = pl.col(raw_column)
        direct_value = pl.col(direct_column)
        fallback = pl.coalesce(raw_value, direct_value)
        if raw_column != "was_council_house":
            return fallback.alias(raw_column)
        # The raw property-level value is fill_null("No") upstream, so a plain
        # coalesce would let a non-null "No" override a directly-matched
        # listing "Yes". "Former council house" should fire if EITHER side
        # says so.
        either_says_yes = (raw_value == "Yes") | (direct_value == "Yes")
        return (
            pl.when(either_says_yes)
            .then(pl.lit("Yes"))
            .otherwise(fallback)
            .alias(raw_column)
        )

    merged_columns = [
        _coalesced(raw_column, direct_column)
        for raw_column, direct_column in _DIRECT_EPC_RAW_COLUMN_MAP.items()
    ]
    return wide.with_columns(merged_columns)
|
||||||
|
|
|
||||||
|
|
@ -12,11 +12,19 @@ from pipeline.utils.poi_counts import count_pois_per_postcode, min_distance_per_
|
||||||
|
|
||||||
# POI category groups for proximity counting (2km radius).
|
# POI category groups for proximity counting (2km radius).
|
||||||
# Names must match the friendly names produced by transform_poi.py / naptan.py.
|
# Names must match the friendly names produced by transform_poi.py / naptan.py.
|
||||||
|
# "groceries" is filled in dynamically by _groceries_categories() because the
|
||||||
|
# GEOLYTIX dataset stores the brand (e.g. "Tesco", "Aldi") in `category` rather
|
||||||
|
# than the literal "Supermarket"; counting only the OSM strings here severely
|
||||||
|
# understates the metric. See _groceries_categories below.
|
||||||
POI_GROUPS_2KM = {
|
POI_GROUPS_2KM = {
|
||||||
"restaurants": ["Restaurant", "Fast Food"],
|
"restaurants": ["Restaurant", "Fast Food"],
|
||||||
"groceries": ["Greengrocer", "Supermarket", "Convenience Store"],
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# POI group whose members are counted for the static "groceries" 2km metric.
|
||||||
|
# Covers both the OSM grocery categories (Supermarket, Convenience Store,
|
||||||
|
# Greengrocer, ...) and the GEOLYTIX brand categories (Tesco, Aldi, ...).
|
||||||
|
GROCERIES_GROUP = "Groceries"
|
||||||
|
|
||||||
# OS Open Greenspace function types used for park counts and distance calculation.
|
# OS Open Greenspace function types used for park counts and distance calculation.
|
||||||
# Uses the authoritative OS dataset instead of OSM point POIs for better coverage
|
# Uses the authoritative OS dataset instead of OSM point POIs for better coverage
|
||||||
# of green spaces that are only mapped as polygons in OSM.
|
# of green spaces that are only mapped as polygons in OSM.
|
||||||
|
|
@ -41,6 +49,26 @@ def _poi_category_slug(category: str) -> str:
|
||||||
return slug or "poi"
|
return slug or "poi"
|
||||||
|
|
||||||
|
|
||||||
|
def _groceries_categories(pois: pl.DataFrame) -> list[str]:
    """Return the sorted, distinct `category` values of the Groceries group.

    `count_pois_per_postcode` matches POIs on `category`, but the authoritative
    GEOLYTIX grocery dataset stores the brand name there (e.g. "Tesco", "Aldi")
    with group "Groceries"; it never emits the literal "Supermarket". Collecting
    every category whose group is Groceries therefore captures both the OSM
    strings and the brand names.

    Args:
        pois: POI table with at least `group` and `category` columns.

    Returns:
        Distinct `category` values for rows in the Groceries group, sorted.

    Raises:
        ValueError: If `pois` has no `group` column.
    """
    if "group" not in pois.columns:
        raise ValueError("POI dataframe must include a 'group' column")

    grocery_rows = pois.filter(pl.col("group") == GROCERIES_GROUP)
    distinct_categories = grocery_rows.get_column("category").unique().sort()
    return distinct_categories.to_list()
|
||||||
|
|
||||||
|
|
||||||
def _build_poi_category_groups(
|
def _build_poi_category_groups(
|
||||||
pois: pl.DataFrame,
|
pois: pl.DataFrame,
|
||||||
) -> tuple[dict[str, list[str]], dict[str, str]]:
|
) -> tuple[dict[str, list[str]], dict[str, str]]:
|
||||||
|
|
@ -122,9 +150,15 @@ def main():
|
||||||
pois = pl.read_parquet(args.pois)
|
pois = pl.read_parquet(args.pois)
|
||||||
poi_category_groups, poi_display_names = _build_poi_category_groups(pois)
|
poi_category_groups, poi_display_names = _build_poi_category_groups(pois)
|
||||||
|
|
||||||
# Count static amenity groups within 2km.
|
# Count static amenity groups within 2km. "groceries" is matched against
|
||||||
|
# every Groceries category (OSM strings + GEOLYTIX brand names) so that
|
||||||
|
# postcodes ringed by GEOLYTIX-only chains (Tesco, Aldi, ...) are counted.
|
||||||
|
groups_2km = {
|
||||||
|
**POI_GROUPS_2KM,
|
||||||
|
"groceries": _groceries_categories(pois),
|
||||||
|
}
|
||||||
counts_2km = count_pois_per_postcode(
|
counts_2km = count_pois_per_postcode(
|
||||||
postcodes, pois, groups=POI_GROUPS_2KM, radius_km=2
|
postcodes, pois, groups=groups_2km, radius_km=2
|
||||||
)
|
)
|
||||||
|
|
||||||
# Dynamic amenity filters: nearest distance plus counts within 2km and 5km for
|
# Dynamic amenity filters: nearest distance plus counts within 2km and 5km for
|
||||||
|
|
|
||||||
|
|
@ -77,9 +77,9 @@ The output of `process_oa` is `list[(postcode, polygon)]` — the per-OA fragmen
|
||||||
|
|
||||||
### Phase 4: Merging and writing
|
### Phase 4: Merging and writing
|
||||||
|
|
||||||
**Fragment merging** (`output.py:merge_fragments`): Groups all fragments by postcode, unions them. If the result is a MultiPolygon (meaning the postcode has disconnected pieces — either from spanning OAs with a gap, or algorithm artifacts), applies a 5m buffer-then-unbuffer to close tiny gaps from floating-point mismatches at OA boundary edges. If still a MultiPolygon after that, keeps only the largest polygon — postcodes are contiguous delivery routes, so detached fragments are artifacts.
|
**Fragment merging** (`output.py:merge_fragments`): Groups all fragments by postcode, unions them. If the result is a MultiPolygon (meaning the postcode has disconnected pieces — either from spanning OAs with a gap, or algorithm artifacts), applies a 5m buffer-then-unbuffer to close tiny gaps from floating-point mismatches at OA boundary edges. If still a MultiPolygon after that, keeps the largest part **plus any other part ≥ `_MIN_DETACHED_PART_AREA` (100 m²)** (`_keep_polygon_parts`); only sub-100 m² noise slivers are dropped. Keeping substantial detached parts matters because a postcode genuinely split across an OA seam (by a railway, river, or main road wider than the 5m buffer) would otherwise lose a chunk — measured at ~1.8% of merged area left as uncovered gaps (often 3000–5000 m² building blocks) before this change.
|
||||||
|
|
||||||
**GeoJSON output** (`output.py:write_district_geojson`): Groups postcodes by district (the outward code, e.g. `SW1A` from `SW1A 1AA`). For each district, converts every postcode polygon from BNG to WGS84 using pyproj, simplifies with 1m tolerance (Douglas-Peucker), rounds coordinates to 6 decimal places (~0.1m precision), and writes a single `{district}.geojson` FeatureCollection. Each Feature has `postcodes` (formatted like `"SW1A 1AA"`) and `mapit_code` (no space: `"SW1A1AA"`) in its properties.
|
**GeoJSON output** (`output.py:write_district_geojson`): Two passes. Pass 1 converts every postcode from BNG to WGS84 (pyproj), simplifies with 1m tolerance (Douglas-Peucker), and snaps to 6 decimal places (~0.1m precision); multi-part postcodes become `MultiPolygon` (`to_wgs84_geojson_multi`, each part handled independently), while single-part postcodes stay `Polygon`. The whole set is then turned into a **partition** (`_resolve_overlaps`): each postcode is trimmed by the union of its higher-priority overlapping neighbours, where **priority = ascending area** (smaller postcodes win contested ground). That single rule handles both seam overlap *and* containment — an enclosed postcode is always smaller than its container, so it keeps its area while the container gets a hole (the query uses both the `overlaps` and `contains` predicates, since `overlaps` alone excludes containment). This step runs last, so nothing re-introduces overlap; a postcode that would be emptied keeps its original geometry, so no active postcode is dropped. Pass 2 groups postcodes by district (the outward code, e.g. `SW1A` from `SW1A 1AA`), rounds coordinates to 6 decimal places, and writes a `{district}.geojson` FeatureCollection. Each Feature has `postcodes` (formatted like `"SW1A 1AA"`) and `mapit_code` (no space: `"SW1A1AA"`) in its properties.
|
||||||
|
|
||||||
## Memory architecture
|
## Memory architecture
|
||||||
|
|
||||||
|
|
@ -103,10 +103,10 @@ Key design choices:
|
||||||
|
|
||||||
## Key invariants
|
## Key invariants
|
||||||
|
|
||||||
1. **Every square meter of every OA is assigned to exactly one postcode** — the combination of INSPIRE claiming + Voronoi fills the entire OA, and overlap resolution ensures no double-counting
|
1. **No two postcodes cover the same ground in the output** — within an OA the INSPIRE claiming + Voronoi tile it with no overlap, and a final `_resolve_overlaps` partition pass removes the thin overlap strips that the merge buffer + per-postcode simplification introduce across OA seams (measured residual overlap ~0.01% of area)
|
||||||
2. **Every postcode that exists in the UPRN data gets a polygon** — unless all its UPRNs share coordinates with another postcode's UPRNs (handled by jitter) or it has zero UPRNs
|
2. **Every postcode that exists in the UPRN data gets a polygon** — unless all its UPRNs share coordinates with another postcode's UPRNs (handled by jitter) or it has zero UPRNs
|
||||||
3. **Postcode polygons never extend outside their OA(s)** — all geometry is clipped to OA boundaries
|
3. **Postcode polygons never extend outside their OA(s)** — all geometry is clipped to OA boundaries
|
||||||
4. **Output is always single Polygon, never MultiPolygon** — the largest-polygon extraction in both `merge_fragments` and `to_wgs84_geojson` ensures this
|
4. **A postcode split across an OA seam keeps all its substantial parts** — `merge_fragments` keeps every part ≥ 100 m² and the output is emitted as a `MultiPolygon` (the Rust server `postcodes.rs` and `loader.py` both parse MultiPolygon); only sub-100 m² noise slivers are dropped
|
||||||
|
|
||||||
## Module structure
|
## Module structure
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,12 +1,21 @@
|
||||||
import argparse
|
import argparse
|
||||||
|
import multiprocessing as mp
|
||||||
|
import os
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import shapely
|
||||||
from shapely.geometry import MultiPolygon, Polygon
|
from shapely.geometry import MultiPolygon, Polygon
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
from .fragments_cache import (
|
||||||
|
fragments_cache_is_fresh,
|
||||||
|
load_fragments,
|
||||||
|
save_fragments,
|
||||||
|
)
|
||||||
from .inspire import (
|
from .inspire import (
|
||||||
|
build_inspire_index,
|
||||||
cache_inspire,
|
cache_inspire,
|
||||||
get_inspire_candidates,
|
|
||||||
inspire_cache_exists,
|
inspire_cache_exists,
|
||||||
load_inspire,
|
load_inspire,
|
||||||
)
|
)
|
||||||
|
|
@ -14,7 +23,206 @@ from .memory import release_memory
|
||||||
from .oa_boundaries import load_oa_boundaries
|
from .oa_boundaries import load_oa_boundaries
|
||||||
from .output import merge_fragments, write_district_geojson
|
from .output import merge_fragments, write_district_geojson
|
||||||
from .process_oa import process_oa
|
from .process_oa import process_oa
|
||||||
from .uprn import get_oa_uprns, load_uprns
|
from .uprn import extract_uprn_arrays, get_oa_uprns_arrays, load_uprns
|
||||||
|
|
||||||
|
Fragment = tuple[str, Polygon | MultiPolygon]
|
||||||
|
|
||||||
|
|
||||||
|
def _oa_fragments(
    oa_code, oa_geoms, east, north, postcodes_arr, offsets, index
) -> tuple[list[Fragment], bool]:
    """Turn a single OA into its list of ``(postcode, geometry)`` fragments.

    The return value is ``(fragments, is_single)``. ``is_single`` is ``True``
    when every UPRN in the OA carries the same postcode, in which case the
    whole OA geometry is handed to that postcode without consulting INSPIRE.
    Both the sequential and the parallel driver call this helper, so they
    cannot diverge.

    Raises:
        RuntimeError: wraps any exception raised while processing, with the
            OA code in the message so the failing OA can be identified.
    """
    try:
        boundary = oa_geoms[oa_code]
        pts, pcs = get_oa_uprns_arrays(east, north, postcodes_arr, offsets, oa_code)
        if len(set(pcs)) != 1:
            # Several postcodes share this OA: split it using the INSPIRE
            # parcels whose bbox overlaps the OA's bounds.
            parcels = index.candidates(boundary.bounds)
            return process_oa(boundary, pts, pcs, parcels), False
        # Fast path: one postcode owns the entire OA.
        return [(pcs[0], boundary)], True
    except Exception as exc:
        raise RuntimeError(f"Failed processing OA {oa_code}: {exc!r}") from exc
|
||||||
|
|
||||||
|
|
||||||
|
# Worker-shared state. Populated in the parent before the pool forks; children
|
||||||
|
# inherit it copy-on-write (the numpy/Arrow buffers + coords mmap stay shared,
|
||||||
|
# never duplicated per worker). Read-only in workers.
|
||||||
|
_WORKER_STATE: dict = {}
|
||||||
|
|
||||||
|
|
||||||
|
def _process_oa_chunk(oa_codes: list[str]):
    """Pool worker: process a chunk of OA codes and return WKB fragments.

    Inputs are read from the module-level ``_WORKER_STATE`` dict. Geometries
    are serialised to WKB instead of being returned as Shapely objects so the
    payload sent back to the parent stays small.

    Returns:
        ``(postcodes, wkb, single_count, n_oas)`` where ``postcodes`` and
        ``wkb`` are parallel sequences (one entry per fragment),
        ``single_count`` is the number of single-postcode OAs in the chunk and
        ``n_oas`` is ``len(oa_codes)``.
    """
    shared = _WORKER_STATE
    collected: list[Fragment] = []
    single_total = 0
    for code in oa_codes:
        produced, was_single = _oa_fragments(
            code,
            shared["oa_geoms"],
            shared["east"],
            shared["north"],
            shared["postcodes"],
            shared["offsets"],
            shared["index"],
        )
        collected.extend(produced)
        single_total += was_single  # bool counts as 0/1

    if not collected:
        return [], np.empty(0, dtype=object), single_total, len(oa_codes)

    labels = [postcode for postcode, _ in collected]
    geometries = np.array([geometry for _, geometry in collected], dtype=object)
    return labels, shapely.to_wkb(geometries), single_total, len(oa_codes)
|
||||||
|
|
||||||
|
|
||||||
|
def _resolve_workers(requested: int) -> int:
|
||||||
|
"""Worker count: the explicit value if >0, otherwise all available CPUs."""
|
||||||
|
if requested and requested > 0:
|
||||||
|
return requested
|
||||||
|
try:
|
||||||
|
return max(1, len(os.sched_getaffinity(0)))
|
||||||
|
except AttributeError:
|
||||||
|
return max(1, os.cpu_count() or 1)
|
||||||
|
|
||||||
|
|
||||||
|
def _process_oas(
    oa_codes, oa_geoms, east, north, postcodes_arr, offsets, index, workers
) -> tuple[list[Fragment], int]:
    """Run Phase 3 for every OA, optionally spread over ``workers`` processes.

    With ``workers <= 1`` (or when the ``fork`` start method is unavailable)
    the OAs are processed in-process, one after another. Otherwise the inputs
    are published through ``_WORKER_STATE`` before a ``fork`` pool is created,
    so the workers see them without the parent having to send them over.
    Chunks are consumed in completion order; that is safe because
    ``merge_fragments`` unions per postcode and so does not depend on fragment
    order.

    Returns:
        ``(fragments, single_count)``: every ``(postcode, geometry)`` fragment
        and the number of OAs that took the single-postcode fast path.
    """
    collected: list[Fragment] = []
    n_single = 0

    sequential = workers <= 1 or "fork" not in mp.get_all_start_methods()
    if sequential:
        progress = tqdm(
            oa_codes, desc="Processing OAs", unit="OA", smoothing=0.01, miniters=100
        )
        for code in progress:
            produced, was_single = _oa_fragments(
                code, oa_geoms, east, north, postcodes_arr, offsets, index
            )
            collected.extend(produced)
            n_single += was_single
        return collected, n_single

    _WORKER_STATE.update(
        oa_geoms=oa_geoms,
        east=east,
        north=north,
        postcodes=postcodes_arr,
        offsets=offsets,
        index=index,
    )
    # Roughly 16 chunks per worker: small enough that expensive OAs do not
    # leave workers idle, contiguous so neighbouring OAs stay together.
    chunk_size = max(1, len(oa_codes) // (workers * 16))
    chunks = [
        oa_codes[start : start + chunk_size]
        for start in range(0, len(oa_codes), chunk_size)
    ]
    print(f" Parallel: {workers} workers, {len(chunks)} chunks of ~{chunk_size} OAs")

    fork_ctx = mp.get_context("fork")
    try:
        with fork_ctx.Pool(processes=workers) as pool, tqdm(
            total=len(oa_codes), desc="Processing OAs", unit="OA", smoothing=0.01
        ) as bar:
            results = pool.imap_unordered(_process_oa_chunk, chunks)
            for labels, wkb, chunk_single, n_oas in results:
                if len(wkb):
                    collected.extend(zip(labels, shapely.from_wkb(wkb)))
                n_single += chunk_single
                bar.update(n_oas)
    finally:
        # Release the references so the big inputs are not kept alive by the
        # module-level dict once this phase is over.
        _WORKER_STATE.clear()
    return collected, n_single
|
||||||
|
|
||||||
|
|
||||||
|
def build_fragments(args: argparse.Namespace) -> list[Fragment]:
    """Execute Phases 1-3 and return every ``(postcode, geometry)`` fragment.

    Phase 1 loads the OA boundaries and UPRNs, Phase 2 loads (building the
    on-disk cache first if needed) the INSPIRE parcels and indexes them, and
    Phase 3 processes each OA that has both a boundary and UPRNs. All large
    intermediates are locals of this function, so nothing here keeps them
    alive after it returns.
    """

    def _banner(title: str, *, spaced: bool = True) -> None:
        # Phase header: optional blank line, then the title between two rules.
        if spaced:
            print()
        rule = "=" * 60
        print(rule)
        print(title)
        print(rule)

    # Phase 1: Load all data
    _banner("Phase 1: Loading data", spaced=False)

    oa_geoms = load_oa_boundaries(args.oa_boundaries)
    uprn_df, uprn_offsets = load_uprns(args.uprn, args.arcgis)
    # Plain arrays are handed to the workers so they never need to call
    # polars after the fork.
    uprn_east, uprn_north, uprn_postcodes = extract_uprn_arrays(uprn_df)

    # Phase 2: Parse/load INSPIRE
    _banner("Phase 2: INSPIRE data")

    cache_dir = args.output / "inspire_cache"
    if not inspire_cache_exists(cache_dir):
        cache_inspire(args.inspire, cache_dir)
    parcel_bboxes, parcel_offsets, parcel_coords = load_inspire(cache_dir)
    inspire_index = build_inspire_index(parcel_bboxes, parcel_offsets, parcel_coords)

    # Phase 3: Process OAs
    _banner("Phase 3: Processing OAs")

    # Only OAs present in both datasets can be processed; count the rest.
    work = sorted(set(oa_geoms.keys()) & set(uprn_offsets.keys()))
    skipped_no_uprn = len(oa_geoms) - len(work)
    skipped_no_boundary = len(uprn_offsets) - len(work)

    debug_limit = args.limit > 0
    if debug_limit:
        work = work[: args.limit]

    print(f" OAs with UPRNs + boundaries: {len(work)}")
    print(f" Skipped (no UPRNs): {skipped_no_uprn}")
    print(f" Skipped (no boundary): {skipped_no_boundary}")

    # --limit is a debug mode, so it always runs single-process.
    workers = 1 if debug_limit else _resolve_workers(args.workers)
    fragments, single_count = _process_oas(
        work,
        oa_geoms,
        uprn_east,
        uprn_north,
        uprn_postcodes,
        uprn_offsets,
        inspire_index,
        workers,
    )
    multi_count = len(work) - single_count

    print(f"\n Single-postcode OAs (fast path): {single_count}")
    print(f" Multi-postcode OAs (INSPIRE+Voronoi): {multi_count}")
    print(f" Total fragments: {len(fragments)}")

    return fragments
|
||||||
|
|
||||||
|
|
||||||
def main() -> None:
|
def main() -> None:
|
||||||
|
|
@ -38,6 +246,12 @@ def main() -> None:
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--limit", type=int, default=0, help="Process only first N OAs (0=all)"
|
"--limit", type=int, default=0, help="Process only first N OAs (0=all)"
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--workers",
|
||||||
|
type=int,
|
||||||
|
default=0,
|
||||||
|
help="Parallel worker processes for OA processing (0=all CPUs, 1=sequential)",
|
||||||
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--greenspace",
|
"--greenspace",
|
||||||
type=Path,
|
type=Path,
|
||||||
|
|
@ -46,79 +260,30 @@ def main() -> None:
|
||||||
)
|
)
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
# Phase 1: Load all data
|
fragments_cache = args.output / "fragments_cache.parquet"
|
||||||
|
# Phase 3 depends only on these inputs; greenspace is applied later (Phase 4),
|
||||||
|
# so a greenspace change must not invalidate the fragment cache.
|
||||||
|
fragment_inputs = [args.uprn, args.arcgis, args.oa_boundaries, args.inspire]
|
||||||
|
# --limit yields a partial fragment set; never read or write the shared cache.
|
||||||
|
use_cache = args.limit == 0
|
||||||
|
|
||||||
|
if use_cache and fragments_cache_is_fresh(fragments_cache, fragment_inputs):
|
||||||
print("=" * 60)
|
print("=" * 60)
|
||||||
print("Phase 1: Loading data")
|
print("Phase 3 cache hit — loading fragments (skipping Phases 1-3)")
|
||||||
print("=" * 60)
|
print("=" * 60)
|
||||||
|
all_fragments = load_fragments(fragments_cache)
|
||||||
oa_geoms = load_oa_boundaries(args.oa_boundaries)
|
print(
|
||||||
uprn_df, uprn_offsets = load_uprns(args.uprn, args.arcgis)
|
f" Loaded {len(all_fragments):,} cached fragments from {fragments_cache}"
|
||||||
|
|
||||||
# Phase 2: Parse/load INSPIRE
|
|
||||||
print()
|
|
||||||
print("=" * 60)
|
|
||||||
print("Phase 2: INSPIRE data")
|
|
||||||
print("=" * 60)
|
|
||||||
|
|
||||||
inspire_cache_dir = args.output / "inspire_cache"
|
|
||||||
if not inspire_cache_exists(inspire_cache_dir):
|
|
||||||
cache_inspire(args.inspire, inspire_cache_dir)
|
|
||||||
inspire_bboxes, inspire_offsets, inspire_coords = load_inspire(inspire_cache_dir)
|
|
||||||
|
|
||||||
# Phase 3: Process OAs
|
|
||||||
print()
|
|
||||||
print("=" * 60)
|
|
||||||
print("Phase 3: Processing OAs")
|
|
||||||
print("=" * 60)
|
|
||||||
|
|
||||||
# Build work list — precompute which OAs are single vs multi-postcode
|
|
||||||
oa_codes_with_data = sorted(set(oa_geoms.keys()) & set(uprn_offsets.keys()))
|
|
||||||
skipped_no_uprn = len(oa_geoms) - len(oa_codes_with_data)
|
|
||||||
skipped_no_boundary = len(uprn_offsets) - len(oa_codes_with_data)
|
|
||||||
|
|
||||||
if args.limit > 0:
|
|
||||||
oa_codes_with_data = oa_codes_with_data[: args.limit]
|
|
||||||
|
|
||||||
print(f" OAs with UPRNs + boundaries: {len(oa_codes_with_data)}")
|
|
||||||
print(f" Skipped (no UPRNs): {skipped_no_uprn}")
|
|
||||||
print(f" Skipped (no boundary): {skipped_no_boundary}")
|
|
||||||
|
|
||||||
all_fragments: list[tuple[str, Polygon | MultiPolygon]] = []
|
|
||||||
single_count = 0
|
|
||||||
multi_count = 0
|
|
||||||
|
|
||||||
for oa_code in tqdm(
|
|
||||||
oa_codes_with_data,
|
|
||||||
desc="Processing OAs",
|
|
||||||
unit="OA",
|
|
||||||
smoothing=0.01,
|
|
||||||
miniters=100,
|
|
||||||
):
|
|
||||||
oa_geom = oa_geoms[oa_code]
|
|
||||||
points, postcodes = get_oa_uprns(uprn_df, uprn_offsets, oa_code)
|
|
||||||
|
|
||||||
if len(set(postcodes)) == 1:
|
|
||||||
# Fast path: entire OA = one postcode
|
|
||||||
all_fragments.append((postcodes[0], oa_geom))
|
|
||||||
single_count += 1
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Get INSPIRE candidates via bbox pre-filter
|
|
||||||
candidates = get_inspire_candidates(
|
|
||||||
oa_geom.bounds, inspire_bboxes, inspire_offsets, inspire_coords
|
|
||||||
)
|
)
|
||||||
|
else:
|
||||||
|
all_fragments = build_fragments(args)
|
||||||
|
if use_cache:
|
||||||
|
# Persist the expensive Phase-3 output before the cheap-but-fragile
|
||||||
|
# merge/write so any failure there resumes in seconds, not ~10 hours.
|
||||||
|
save_fragments(fragments_cache, all_fragments)
|
||||||
|
print(f" Cached {len(all_fragments):,} fragments to {fragments_cache}")
|
||||||
|
|
||||||
fragments = process_oa(oa_geom, points, postcodes, candidates)
|
# Free Phase-1-3 intermediates (build_fragments' locals) back to the OS.
|
||||||
all_fragments.extend(fragments)
|
|
||||||
multi_count += 1
|
|
||||||
|
|
||||||
print(f"\n Single-postcode OAs (fast path): {single_count}")
|
|
||||||
print(f" Multi-postcode OAs (INSPIRE+Voronoi): {multi_count}")
|
|
||||||
print(f" Total fragments: {len(all_fragments)}")
|
|
||||||
|
|
||||||
# Free data no longer needed
|
|
||||||
del oa_geoms, uprn_df, uprn_offsets
|
|
||||||
del inspire_bboxes, inspire_offsets, inspire_coords
|
|
||||||
release_memory()
|
release_memory()
|
||||||
|
|
||||||
# Phase 4: Merge and write
|
# Phase 4: Merge and write
|
||||||
|
|
@ -145,6 +310,12 @@ def main() -> None:
|
||||||
|
|
||||||
file_count = write_district_geojson(merged, args.output)
|
file_count = write_district_geojson(merged, args.output)
|
||||||
print(f"\n Wrote {file_count} district GeoJSON files to {args.output / 'units'}")
|
print(f"\n Wrote {file_count} district GeoJSON files to {args.output / 'units'}")
|
||||||
|
|
||||||
|
# The cache exists only to survive a crash between Phase 3 and a clean write.
|
||||||
|
# Now that the output is complete, drop it so a later input change can never
|
||||||
|
# be served from a stale cache.
|
||||||
|
if use_cache:
|
||||||
|
fragments_cache.unlink(missing_ok=True)
|
||||||
print("Done!")
|
print("Done!")
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -112,37 +112,113 @@ def load_inspire(
|
||||||
return bboxes, offsets, coords_mmap
|
return bboxes, offsets, coords_mmap
|
||||||
|
|
||||||
|
|
||||||
def get_inspire_candidates(
|
# Grid cell size (m) for the parcel spatial index. The median parcel is ~25 m
|
||||||
oa_bounds: tuple[float, float, float, float],
|
# and the 99th percentile ~540 m, so almost every parcel fits inside a single
|
||||||
|
# 1 km cell; the ~0.4% larger than a cell go to an overflow list tested on every
|
||||||
|
# query.
|
||||||
|
_GRID_CELL_SIZE = 1000.0
|
||||||
|
|
||||||
|
|
||||||
|
class InspireIndex:
|
||||||
|
"""Uniform-grid spatial index over INSPIRE parcel bounding boxes.
|
||||||
|
|
||||||
|
The per-OA candidate lookup used to linear-scan all ~24M bboxes (O(N) per
|
||||||
|
OA, ~4 h total over the country). This indexes parcels by grid cell so each
|
||||||
|
lookup is O(cells_spanned + candidates). Parcels no larger than one cell are
|
||||||
|
bucketed by their bbox min-corner cell in a CSR layout (parcel indices sorted
|
||||||
|
by cell id, located with ``searchsorted``); the few parcels larger than a
|
||||||
|
cell are kept in an overflow array tested directly on every query. An exact
|
||||||
|
bbox test then runs on the gathered subset and the result is sorted, so the
|
||||||
|
candidate set -- and its order -- is byte-for-byte identical to the old scan.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
bboxes: np.ndarray,
|
bboxes: np.ndarray,
|
||||||
offsets: np.ndarray,
|
offsets: np.ndarray,
|
||||||
coords_mmap: np.memmap,
|
coords_mmap: np.memmap,
|
||||||
) -> list[Polygon]:
|
cell_size: float = _GRID_CELL_SIZE,
|
||||||
"""Get INSPIRE polygons overlapping an OA via bbox pre-filter.
|
) -> None:
|
||||||
|
self._bboxes = bboxes
|
||||||
|
self._offsets = offsets
|
||||||
|
self._coords = coords_mmap
|
||||||
|
self._cell_size = cell_size
|
||||||
|
self._origin_x = float(bboxes[:, 0].min())
|
||||||
|
self._origin_y = float(bboxes[:, 1].min())
|
||||||
|
# Flattened cell id is ``cx * _ny + cy``; +2 leaves a guard row so the
|
||||||
|
# query's one-cell low-edge widening can never collide with cx-1.
|
||||||
|
self._ny = int((bboxes[:, 1].max() - self._origin_y) // cell_size) + 2
|
||||||
|
|
||||||
|
width = bboxes[:, 2] - bboxes[:, 0]
|
||||||
|
height = bboxes[:, 3] - bboxes[:, 1]
|
||||||
|
small = np.where((width <= cell_size) & (height <= cell_size))[0]
|
||||||
|
self._oversized = np.where((width > cell_size) | (height > cell_size))[0]
|
||||||
|
self._oversized_bb = bboxes[self._oversized]
|
||||||
|
|
||||||
|
cx = ((bboxes[small, 0] - self._origin_x) // cell_size).astype(np.int64)
|
||||||
|
cy = ((bboxes[small, 1] - self._origin_y) // cell_size).astype(np.int64)
|
||||||
|
cell_id = cx * self._ny + cy
|
||||||
|
order = np.argsort(cell_id, kind="stable")
|
||||||
|
self._sorted_cells = cell_id[order]
|
||||||
|
self._cell_parcels = small[order]
|
||||||
|
|
||||||
|
def candidate_indices(self, oa_bounds: tuple[float, float, float, float]) -> np.ndarray:
    """Return the parcel indices whose bbox overlaps ``oa_bounds``.

    Args:
        oa_bounds: ``(min_e, min_n, max_e, max_n)`` query rectangle, in the
            same coordinates as the indexed bboxes.

    Returns:
        A 1-D array of parcel indices in ascending order, each at most once.
        It is empty when nothing overlaps.

    The grid lookup only gathers a superset; an exact bbox test on the
    gathered parcels decides the final answer.
    """
    min_e, min_n, max_e, max_n = oa_bounds
    cs = self._cell_size

    parts = []

    # Parcels larger than a cell are not in the grid: test them all directly.
    ob = self._oversized_bb
    if len(ob):
        mo = (
            (ob[:, 2] >= min_e)
            & (ob[:, 0] <= max_e)
            & (ob[:, 3] >= min_n)
            & (ob[:, 1] <= max_n)
        )
        if mo.any():
            parts.append(self._oversized[mo])

    cells = self._sorted_cells
    if len(cells):
        ny = self._ny
        # Cell ids are ``cx * ny + cy`` and sorted, so the last one gives the
        # highest occupied column.
        last_col = int(cells[-1]) // ny
        # A small parcel (<= one cell) overlapping the query has its
        # min-corner at most one cell below/left of it, hence the ``- cs``
        # widening of the low edges.
        #
        # Both ranges are clamped to the grid. Without the clamp a query that
        # extends past the grid in y makes ``base + row`` spill into the
        # neighbouring column's id range; when that column is scanned as well
        # the same parcels are gathered twice and returned as duplicates.
        col_lo = max(int((min_e - cs - self._origin_x) // cs), 0)
        col_hi = min(int((max_e - self._origin_x) // cs), last_col)
        row_lo = max(int((min_n - cs - self._origin_y) // cs), 0)
        row_hi = min(int((max_n - self._origin_y) // cs), ny - 1)

        if row_lo <= row_hi:
            for gx in range(col_lo, col_hi + 1):
                base = gx * ny
                lo = np.searchsorted(cells, base + row_lo, "left")
                hi = np.searchsorted(cells, base + row_hi, "right")
                if hi > lo:
                    parts.append(self._cell_parcels[lo:hi])

    if not parts:
        return np.empty(0, dtype=np.int64)

    cand = np.concatenate(parts)
    cb = self._bboxes[cand]
    mask = (
        (cb[:, 2] >= min_e)
        & (cb[:, 0] <= max_e)
        & (cb[:, 3] >= min_n)
        & (cb[:, 1] <= max_n)
    )
    # Ascending order matches what a full ``np.where`` scan would return.
    return np.sort(cand[mask])
|
||||||
|
|
||||||
|
def candidates(
|
||||||
|
self, oa_bounds: tuple[float, float, float, float]
|
||||||
|
) -> list[Polygon]:
|
||||||
|
"""INSPIRE polygons overlapping an OA, built from the mmap on demand.
|
||||||
|
|
||||||
Builds Shapely objects only for matches (typically 10-500 per OA).
|
Builds Shapely objects only for matches (typically 10-500 per OA).
|
||||||
Reads coordinate data on-demand from memory-mapped file.
|
|
||||||
"""
|
"""
|
||||||
min_e, min_n, max_e, max_n = oa_bounds
|
|
||||||
|
|
||||||
# Vectorized bbox overlap test
|
|
||||||
mask = (
|
|
||||||
(bboxes[:, 2] >= min_e)
|
|
||||||
& (bboxes[:, 0] <= max_e)
|
|
||||||
& (bboxes[:, 3] >= min_n)
|
|
||||||
& (bboxes[:, 1] <= max_n)
|
|
||||||
)
|
|
||||||
idxs = np.where(mask)[0]
|
|
||||||
if len(idxs) == 0:
|
|
||||||
return []
|
|
||||||
|
|
||||||
# Build Shapely polygons only for candidates (coords from mmap)
|
|
||||||
candidates = []
|
candidates = []
|
||||||
for i in idxs:
|
for i in self.candidate_indices(oa_bounds):
|
||||||
byte_offset = offsets[i, 0]
|
byte_offset = self._offsets[i, 0]
|
||||||
n_pts = offsets[i, 1]
|
n_pts = self._offsets[i, 1]
|
||||||
float_offset = byte_offset // 8 # float64 = 8 bytes
|
float_offset = byte_offset // 8 # float64 = 8 bytes
|
||||||
coords = coords_mmap[float_offset : float_offset + n_pts * 2].reshape(-1, 2)
|
coords = self._coords[float_offset : float_offset + n_pts * 2].reshape(-1, 2)
|
||||||
poly = Polygon(coords)
|
poly = Polygon(coords)
|
||||||
if not poly.is_valid:
|
if not poly.is_valid:
|
||||||
poly = make_valid(poly)
|
poly = make_valid(poly)
|
||||||
|
|
@ -153,3 +229,13 @@ def get_inspire_candidates(
|
||||||
if not poly.is_empty:
|
if not poly.is_empty:
|
||||||
candidates.append(poly)
|
candidates.append(poly)
|
||||||
return candidates
|
return candidates
|
||||||
|
|
||||||
|
|
||||||
|
def build_inspire_index(
    bboxes: np.ndarray,
    offsets: np.ndarray,
    coords_mmap: np.memmap,
    cell_size: float = _GRID_CELL_SIZE,
) -> InspireIndex:
    """Construct the grid index that serves per-OA parcel candidate lookups.

    Thin factory around :class:`InspireIndex`; all arguments are forwarded
    unchanged.
    """
    return InspireIndex(
        bboxes=bboxes,
        offsets=offsets,
        coords_mmap=coords_mmap,
        cell_size=cell_size,
    )
|
||||||
|
|
|
||||||
|
|
@ -3,8 +3,9 @@ import shutil
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
from pyproj import Transformer
|
from pyproj import Transformer
|
||||||
from shapely import make_valid, set_precision
|
from shapely import STRtree, make_valid, set_precision
|
||||||
from shapely.errors import GEOSException
|
from shapely.errors import GEOSException
|
||||||
from shapely.geometry import MultiPolygon, Polygon, mapping, shape
|
from shapely.geometry import MultiPolygon, Polygon, mapping, shape
|
||||||
from shapely.ops import transform as transform_geometry
|
from shapely.ops import transform as transform_geometry
|
||||||
|
|
@ -41,30 +42,30 @@ def _largest_polygonal(geom) -> Polygon | None:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def to_wgs84_geojson(
|
# Output coordinate grid (~0.11 m at UK latitudes). Polygons whose extent is
|
||||||
geom: Polygon | MultiPolygon, tolerance: float = 1.0
|
# below this in any direction snap to empty during serialization.
|
||||||
) -> dict | None:
|
_OUTPUT_PRECISION_DEG = 0.000001
|
||||||
"""Simplify geometry in BNG, convert to WGS84, return a valid GeoJSON dict.
|
# Minimal BNG buffer used to rescue sub-grid slivers into a representable
|
||||||
|
# footprint. A near-zero-area Voronoi/INSPIRE spike (e.g. three almost-collinear
|
||||||
|
# vertices) would otherwise vanish at output precision; since every *active*
|
||||||
|
# postcode must keep a boundary (validate_outputs enforces this with zero
|
||||||
|
# tolerance), we fatten it just enough to survive snapping rather than drop it.
|
||||||
|
_MIN_FOOTPRINT_BUFFER_M = 0.5
|
||||||
|
|
||||||
|
|
||||||
|
def _snap_to_wgs84_geojson(geom_bng: Polygon | MultiPolygon) -> dict | None:
|
||||||
|
"""Transform a BNG polygon to WGS84, snap to output precision, validate.
|
||||||
|
|
||||||
Validates the *serialized* GeoJSON dict (via a ``shape()`` round-trip), not
|
Validates the *serialized* GeoJSON dict (via a ``shape()`` round-trip), not
|
||||||
just the intermediate Shapely object: coordinate snapping during
|
just the intermediate Shapely object: coordinate snapping during
|
||||||
serialization can otherwise leave a self-intersecting ring that only shows up
|
serialization can otherwise leave a self-intersecting ring that only shows up
|
||||||
once the feature is read back from disk. Any such geometry is repaired with
|
once the feature is read back from disk. Returns ``None`` if the geometry
|
||||||
``make_valid`` before returning so written features are always valid.
|
collapses to empty (a sub-grid sliver).
|
||||||
"""
|
"""
|
||||||
geom = _largest_polygonal(geom)
|
|
||||||
if geom is None:
|
|
||||||
return None
|
|
||||||
|
|
||||||
simplified = geom.simplify(tolerance, preserve_topology=True)
|
|
||||||
simplified = _largest_polygonal(simplified)
|
|
||||||
if simplified is None:
|
|
||||||
return None
|
|
||||||
|
|
||||||
transformer = _get_to_wgs84()
|
transformer = _get_to_wgs84()
|
||||||
wgs84 = transform_geometry(transformer.transform, simplified)
|
wgs84 = transform_geometry(transformer.transform, geom_bng)
|
||||||
try:
|
try:
|
||||||
wgs84 = set_precision(wgs84, 0.000001, mode="valid_output")
|
wgs84 = set_precision(wgs84, _OUTPUT_PRECISION_DEG, mode="valid_output")
|
||||||
except GEOSException:
|
except GEOSException:
|
||||||
# Precision snapping can fail on pathological geometries; fall back to a
|
# Precision snapping can fail on pathological geometries; fall back to a
|
||||||
# plain validity repair without coordinate snapping.
|
# plain validity repair without coordinate snapping.
|
||||||
|
|
@ -87,20 +88,105 @@ def to_wgs84_geojson(
|
||||||
return geojson_dict
|
return geojson_dict
|
||||||
|
|
||||||
|
|
||||||
|
def _rescue_footprint(geom_bng) -> dict | None:
    """Buffer a degenerate BNG geometry so it survives snapping, then serialise.

    The geometry is buffered by ``_MIN_FOOTPRINT_BUFFER_M``, reduced with
    ``_largest_polygonal`` and passed to ``_snap_to_wgs84_geojson``. Returns
    ``None`` when no polygon is left after buffering, otherwise whatever the
    snap step returns.
    """
    fattened = geom_bng.buffer(_MIN_FOOTPRINT_BUFFER_M)
    footprint = _largest_polygonal(fattened)
    return None if footprint is None else _snap_to_wgs84_geojson(footprint)
|
||||||
|
|
||||||
|
|
||||||
|
def to_wgs84_geojson(
    geom: Polygon | MultiPolygon, tolerance: float = 1.0
) -> dict | None:
    """Simplify a BNG geometry, convert it to WGS84 and return a GeoJSON dict.

    Some postcodes shrink to a sliver that snaps to nothing at output
    precision. Because an active postcode must always keep a boundary, such
    slivers are not dropped but rescued in two stages:

    1. buffer the simplified sliver itself, keeping its real shape/location;
    2. if that still yields nothing (or the input could not be reduced to a
       polygon at all), buffer ``geom.representative_point()`` instead.

    Args:
        geom: geometry to serialise; ``None`` and empty geometries are allowed.
        tolerance: simplification tolerance passed to ``simplify``.

    Returns:
        The GeoJSON geometry dict, or ``None`` for ``None``/empty input (or
        if even the final rescue produces nothing).
    """
    if geom is None or geom.is_empty:
        return None

    polygon = _largest_polygonal(geom)
    if polygon is not None:
        reduced = _largest_polygonal(
            polygon.simplify(tolerance, preserve_topology=True)
        )
        # Simplification can destroy the polygon; fall back to the unsimplified one.
        target = polygon if reduced is None else reduced
        # Normal path first; if snapping erases the sliver, fatten its real shape.
        for attempt in (_snap_to_wgs84_geojson, _rescue_footprint):
            encoded = attempt(target)
            if encoded is not None:
                return encoded

    # Last resort for input too degenerate to clean or fatten in place.
    return _rescue_footprint(geom.representative_point())
|
||||||
|
|
||||||
|
|
||||||
|
def to_wgs84_geojson_multi(
    geom: Polygon | MultiPolygon, tolerance: float = 1.0
) -> dict | None:
    """Serialise a (possibly multi-part) postcode geometry, keeping every part.

    Each part goes through :func:`to_wgs84_geojson` on its own, so it is
    simplified, snapped and, if necessary, rescued independently.

    Args:
        geom: ``Polygon`` or ``MultiPolygon`` in BNG; ``None`` and empty
            geometries are accepted and yield ``None``, matching
            :func:`to_wgs84_geojson`.
        tolerance: simplification tolerance forwarded to each part.

    Returns:
        The single part's dict when exactly one part survives, a
        ``MultiPolygon`` dict when several do, or ``None`` when none does.
    """
    # Guard first: the sibling converter tolerates None/empty, so this one
    # should not crash on ``geom.geom_type`` for the same input.
    if geom is None or geom.is_empty:
        return None

    parts = list(geom.geoms) if geom.geom_type == "MultiPolygon" else [geom]
    part_dicts = [d for part in parts if (d := to_wgs84_geojson(part, tolerance))]
    if not part_dicts:
        return None
    if len(part_dicts) == 1:
        return part_dicts[0]

    polygons = []
    for part_dict in part_dicts:
        if part_dict.get("type") == "MultiPolygon":
            # NOTE(review): defensive. If a part ever serialises as a
            # MultiPolygon, its coordinates are already a list of polygons;
            # appending them as one entry would nest a level too deep.
            polygons.extend(part_dict["coordinates"])
        else:
            polygons.append(part_dict["coordinates"])
    return {"type": "MultiPolygon", "coordinates": polygons}
|
||||||
|
|
||||||
|
|
||||||
|
# Interior holes from the INSPIRE+Voronoi+make_valid chain are small artifacts and
|
||||||
|
# get filled. A hole at least this large is likely a genuinely enclosed postcode
|
||||||
|
# (kept, so we never solidify over a neighbour); the de-overlap pass is the real
|
||||||
|
# guarantee, this is defence-in-depth.
|
||||||
|
_MAX_ARTIFACT_HOLE_AREA = 1000.0
|
||||||
|
|
||||||
|
|
||||||
|
def _fill_small_holes(poly: Polygon) -> Polygon:
|
||||||
|
kept = [r for r in poly.interiors if Polygon(r).area >= _MAX_ARTIFACT_HOLE_AREA]
|
||||||
|
return Polygon(poly.exterior, kept)
|
||||||
|
|
||||||
|
|
||||||
def _fill_holes(geom):
|
def _fill_holes(geom):
|
||||||
"""Remove all interior rings (holes) from a polygon or multipolygon."""
|
"""Fill small artifact interior rings; keep large (real-enclosed) holes."""
|
||||||
if geom.geom_type == "Polygon":
|
if geom.geom_type == "Polygon":
|
||||||
return Polygon(geom.exterior)
|
return _fill_small_holes(geom)
|
||||||
elif geom.geom_type == "MultiPolygon":
|
elif geom.geom_type == "MultiPolygon":
|
||||||
return MultiPolygon([Polygon(p.exterior) for p in geom.geoms])
|
return MultiPolygon([_fill_small_holes(p) for p in geom.geoms])
|
||||||
return geom
|
return geom
|
||||||
|
|
||||||
|
|
||||||
def _largest_polygon(geom):
|
# A postcode genuinely split across an OA seam (by a railway, river, or main road
|
||||||
"""Extract the largest polygon from a MultiPolygon."""
|
# wider than the merge buffer) arrives here as a MultiPolygon. Keeping only the
|
||||||
if geom.geom_type == "MultiPolygon":
|
# largest part used to discard the rest, leaving ~1.8% of merged area as uncovered
|
||||||
return max(geom.geoms, key=lambda g: g.area)
|
# gaps (often 3000-5000 m² building blocks). Keep every part at least this big;
|
||||||
|
# smaller detached bits are Voronoi/clipping noise and are still dropped.
|
||||||
|
_MIN_DETACHED_PART_AREA = 100.0
|
||||||
|
|
||||||
|
|
||||||
|
def _keep_polygon_parts(geom):
|
||||||
|
"""Keep all MultiPolygon parts >= _MIN_DETACHED_PART_AREA (largest if none)."""
|
||||||
|
if geom.geom_type != "MultiPolygon":
|
||||||
return geom
|
return geom
|
||||||
|
parts = [g for g in geom.geoms if g.area >= _MIN_DETACHED_PART_AREA]
|
||||||
|
if not parts:
|
||||||
|
parts = [max(geom.geoms, key=lambda g: g.area)]
|
||||||
|
return parts[0] if len(parts) == 1 else MultiPolygon(parts)
|
||||||
|
|
||||||
|
|
||||||
def merge_fragments(
|
def merge_fragments(
|
||||||
|
|
@ -126,14 +212,19 @@ def merge_fragments(
|
||||||
continue
|
continue
|
||||||
if not combined.is_valid:
|
if not combined.is_valid:
|
||||||
combined = make_valid(combined)
|
combined = make_valid(combined)
|
||||||
# Close tiny gaps between adjacent OA boundary edges (float mismatches)
|
# Close tiny gaps between adjacent OA boundary edges (float mismatches).
|
||||||
|
# The closing can erode a tiny MultiPolygon (e.g. a postcode with only a
|
||||||
|
# sliver fragment) to nothing, which would leave the postcode with no
|
||||||
|
# geometry at all — keep the un-closed shape if that happens.
|
||||||
if combined.geom_type == "MultiPolygon":
|
if combined.geom_type == "MultiPolygon":
|
||||||
combined = combined.buffer(5.0).buffer(-5.0)
|
closed = combined.buffer(5.0).buffer(-5.0)
|
||||||
if not combined.is_valid:
|
if not closed.is_valid:
|
||||||
combined = make_valid(combined)
|
closed = make_valid(closed)
|
||||||
# Postcodes are contiguous delivery routes — keep only the largest
|
if not closed.is_empty:
|
||||||
# polygon; small detached fragments are algorithm artifacts
|
combined = closed
|
||||||
combined = _largest_polygon(combined)
|
# Keep the postcode whole: the largest part plus any other substantial
|
||||||
|
# part (a genuine railway/river split), dropping only tiny noise slivers.
|
||||||
|
combined = _keep_polygon_parts(combined)
|
||||||
# Remove artifact interior holes from INSPIRE+Voronoi+make_valid chain
|
# Remove artifact interior holes from INSPIRE+Voronoi+make_valid chain
|
||||||
combined = _fill_holes(combined)
|
combined = _fill_holes(combined)
|
||||||
# Subtract parks/water if provided
|
# Subtract parks/water if provided
|
||||||
|
|
@ -142,7 +233,7 @@ def merge_fragments(
|
||||||
|
|
||||||
pre_green = combined
|
pre_green = combined
|
||||||
combined = subtract_greenspace(combined, greenspace_tree, greenspace_geoms)
|
combined = subtract_greenspace(combined, greenspace_tree, greenspace_geoms)
|
||||||
combined = _largest_polygon(combined)
|
combined = _keep_polygon_parts(combined)
|
||||||
# Do NOT _fill_holes here: interior holes carved by the greenspace
|
# Do NOT _fill_holes here: interior holes carved by the greenspace
|
||||||
# subtraction (lakes, enclosed parks) are intentional, not artifacts.
|
# subtraction (lakes, enclosed parks) are intentional, not artifacts.
|
||||||
# Filling them would re-add the removed area and negate the
|
# Filling them would re-add the removed area and negate the
|
||||||
|
|
@ -155,10 +246,114 @@ def merge_fragments(
|
||||||
return merged
|
return merged
|
||||||
|
|
||||||
|
|
||||||
|
def _polygonal(geom):
|
||||||
|
"""Return only the polygonal part(s) of a geometry, or None if none remain."""
|
||||||
|
if geom is None or geom.is_empty:
|
||||||
|
return None
|
||||||
|
if geom.geom_type in ("Polygon", "MultiPolygon"):
|
||||||
|
return geom
|
||||||
|
if geom.geom_type == "GeometryCollection":
|
||||||
|
polys = [
|
||||||
|
g
|
||||||
|
for g in geom.geoms
|
||||||
|
if g.geom_type in ("Polygon", "MultiPolygon") and not g.is_empty
|
||||||
|
]
|
||||||
|
if not polys:
|
||||||
|
return None
|
||||||
|
merged = unary_union(polys)
|
||||||
|
return merged if not merged.is_empty else None
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _resolve_overlaps(
    items: list[tuple[str, Polygon | MultiPolygon]],
) -> list[tuple[str, Polygon | MultiPolygon]]:
    """Make the postcode polygons a partition: no two cover the same ground.

    Overlap appears at OA seams (the 5m merge buffer expands each postcode
    independently), from simplifying each postcode on its own, and as genuine
    containment (a postcode fully enclosed by another). Each postcode is trimmed
    by the union of its higher-priority overlapping neighbours, where **priority =
    ascending area**: a smaller postcode wins contested ground. That single rule
    handles both cases correctly — an enclosed postcode is always smaller than its
    container, so it keeps its area while the container gets a hole (an `overlaps`
    query alone would miss containment entirely). Run last, on the final output
    geometries, so nothing re-introduces overlap afterwards. A postcode that would
    be emptied keeps its original geometry, so an active postcode is never dropped.

    Args:
        items: ``(postcode, geometry)`` pairs. The postcode string is used only
            as a tie-breaker when two geometries have equal area.

    Returns:
        ``items`` itself when it holds fewer than two entries; otherwise a new
        list of ``(postcode, geometry)`` pairs in the same order as ``items``,
        with each geometry replaced by its trimmed form where trimming left
        something polygonal.
    """
    geoms = [g for _, g in items]
    n = len(geoms)
    if n < 2:
        # Nothing can overlap with fewer than two geometries.
        return items

    # rank[i]: 0 = highest priority (smallest area). Postcode string breaks ties
    # for determinism.
    rank = {
        idx: r
        for r, idx in enumerate(
            sorted(range(n), key=lambda i: (geoms[i].area, items[i][0]))
        )
    }

    tree = STRtree(geoms)
    arr = np.array(geoms, dtype=object)
    # Unordered index pairs (low, high) of geometries that share 2-D ground.
    pairs: set[tuple[int, int]] = set()
    # "overlaps" gives partial overlaps; "contains" gives containment (which
    # "overlaps" excludes) — together they cover every 2-D overlap without the
    # edge-touch explosion a plain "intersects" query would add.
    for predicate in ("overlaps", "contains"):
        qsrc, qtgt = tree.query(arr, predicate=predicate)
        for s, t in zip(qsrc.tolist(), qtgt.tolist()):
            # Skip self-matches and normalise the pair so (a, b) and (b, a)
            # collapse into one entry.
            if s != t:
                pairs.add((s, t) if s < t else (t, s))

    # For each loser (lower priority) the higher-priority neighbours to subtract.
    higher: dict[int, list[int]] = defaultdict(list)
    for a, b in pairs:
        winner, loser = (a, b) if rank[a] < rank[b] else (b, a)
        higher[loser].append(winner)

    # Start from the untouched geometries; only losers are overwritten below.
    out = list(geoms)
    # Process losers from highest priority down, so every subtracted neighbour is
    # already finalised.
    for i in sorted(higher, key=lambda idx: rank[idx]):
        cut = unary_union([out[j] for j in higher[i]])
        trimmed = out[i].difference(cut)
        if not trimmed.is_valid:
            trimmed = make_valid(trimmed)
        # Keep all polygonal parts: these geometries are in WGS84 degrees, so an
        # area threshold here would wrongly drop everything but the largest part
        # and re-open the very gaps the seam fix closed.
        trimmed = _polygonal(trimmed)
        # If nothing polygonal is left, out[i] keeps its previous geometry, so
        # that postcode may still overlap its neighbours.
        if trimmed is not None and not trimmed.is_empty:
            out[i] = trimmed
    return [(pc, out[i]) for i, (pc, _) in enumerate(items)]
|
||||||
|
|
||||||
|
|
||||||
|
def _round_coords(coords, ndigits=6):
|
||||||
|
if coords and isinstance(coords[0], (int, float)):
|
||||||
|
return [round(coords[0], ndigits), round(coords[1], ndigits)]
|
||||||
|
return [_round_coords(c, ndigits) for c in coords]
|
||||||
|
|
||||||
|
|
||||||
|
def _geojson_geometry(geom) -> dict | None:
|
||||||
|
"""Serialize a WGS84 polygon/multipolygon to a 6dp GeoJSON dict, or None."""
|
||||||
|
geom = _polygonal(geom if geom.is_valid else make_valid(geom))
|
||||||
|
if geom is None or geom.is_empty:
|
||||||
|
return None
|
||||||
|
gj = mapping(geom)
|
||||||
|
return {"type": gj["type"], "coordinates": _round_coords(gj["coordinates"])}
|
||||||
|
|
||||||
|
|
||||||
def write_district_geojson(
|
def write_district_geojson(
|
||||||
postcodes: dict[str, Polygon | MultiPolygon], output_dir: Path
|
postcodes: dict[str, Polygon | MultiPolygon], output_dir: Path
|
||||||
) -> int:
|
) -> int:
|
||||||
"""Group postcodes by district, write GeoJSON files. Returns file count."""
|
"""Group postcodes by district, write GeoJSON files. Returns file count.
|
||||||
|
|
||||||
|
Before writing, the postcode polygons are converted to their final WGS84 form
|
||||||
|
and made a partition (overlaps removed) so the output never has two postcodes
|
||||||
|
covering the same ground.
|
||||||
|
"""
|
||||||
units_dir = output_dir / "units"
|
units_dir = output_dir / "units"
|
||||||
tmp_units_dir = output_dir / "units.tmp"
|
tmp_units_dir = output_dir / "units.tmp"
|
||||||
output_dir.mkdir(parents=True, exist_ok=True)
|
output_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
@ -166,38 +361,46 @@ def write_district_geojson(
|
||||||
shutil.rmtree(tmp_units_dir)
|
shutil.rmtree(tmp_units_dir)
|
||||||
tmp_units_dir.mkdir(parents=True)
|
tmp_units_dir.mkdir(parents=True)
|
||||||
|
|
||||||
|
skipped: list[str] = []
|
||||||
|
|
||||||
|
# Pass 1: convert every postcode to its final WGS84 geometry (simplify, snap,
|
||||||
|
# sliver-rescue, multi-part preserved). Sorted → deterministic de-overlap
|
||||||
|
# priority. to_wgs84_geojson_multi returns None only for a genuinely empty
|
||||||
|
# input, which is skipped and reported rather than aborting a multi-hour run.
|
||||||
|
converted: list[tuple[str, Polygon | MultiPolygon]] = []
|
||||||
|
for pc in sorted(postcodes):
|
||||||
|
gj = to_wgs84_geojson_multi(postcodes[pc])
|
||||||
|
if gj is None:
|
||||||
|
skipped.append(pc)
|
||||||
|
continue
|
||||||
|
converted.append((pc, shape(gj)))
|
||||||
|
|
||||||
|
# Remove overlap strips so the output is a clean partition.
|
||||||
|
converted = _resolve_overlaps(converted)
|
||||||
|
|
||||||
by_district: dict[str, list[tuple[str, Polygon | MultiPolygon]]] = defaultdict(list)
|
by_district: dict[str, list[tuple[str, Polygon | MultiPolygon]]] = defaultdict(list)
|
||||||
for pc, geom in postcodes.items():
|
for pc, geom in converted:
|
||||||
parts = pc.split()
|
parts = pc.split()
|
||||||
district = parts[0] if parts else pc[:4]
|
district = parts[0] if parts else pc[:4]
|
||||||
by_district[district].append((pc, geom))
|
by_district[district].append((pc, geom))
|
||||||
|
|
||||||
file_count = 0
|
file_count = 0
|
||||||
seen_postcodes: set[str] = set()
|
|
||||||
for district, entries in tqdm(
|
for district, entries in tqdm(
|
||||||
sorted(by_district.items()), desc="Writing GeoJSON", unit="file"
|
sorted(by_district.items()), desc="Writing GeoJSON", unit="file"
|
||||||
):
|
):
|
||||||
features = []
|
features = []
|
||||||
for pc, geom in sorted(entries, key=lambda x: x[0]):
|
for pc, geom in sorted(entries, key=lambda x: x[0]):
|
||||||
if pc in seen_postcodes:
|
geojson_geom = _geojson_geometry(geom)
|
||||||
raise ValueError(f"Duplicate postcode boundary feature: {pc}")
|
|
||||||
seen_postcodes.add(pc)
|
|
||||||
geojson_geom = to_wgs84_geojson(geom)
|
|
||||||
if geojson_geom is None:
|
if geojson_geom is None:
|
||||||
raise ValueError(f"Postcode boundary collapsed to empty geometry: {pc}")
|
skipped.append(pc)
|
||||||
written_geom = shape(geojson_geom)
|
continue
|
||||||
if written_geom.is_empty or not written_geom.is_valid:
|
|
||||||
raise ValueError(
|
|
||||||
f"Invalid postcode boundary geometry after output: {pc}"
|
|
||||||
)
|
|
||||||
mapit_code = pc.replace(" ", "")
|
|
||||||
features.append(
|
features.append(
|
||||||
{
|
{
|
||||||
"type": "Feature",
|
"type": "Feature",
|
||||||
"geometry": geojson_geom,
|
"geometry": geojson_geom,
|
||||||
"properties": {
|
"properties": {
|
||||||
"postcodes": pc,
|
"postcodes": pc,
|
||||||
"mapit_code": mapit_code,
|
"mapit_code": pc.replace(" ", ""),
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
@ -211,6 +414,14 @@ def write_district_geojson(
|
||||||
json.dump(collection, f, separators=(",", ":"))
|
json.dump(collection, f, separators=(",", ":"))
|
||||||
file_count += 1
|
file_count += 1
|
||||||
|
|
||||||
|
if skipped:
|
||||||
|
preview = ", ".join(skipped[:10])
|
||||||
|
suffix = " …" if len(skipped) > 10 else ""
|
||||||
|
print(
|
||||||
|
f" Skipped {len(skipped)} postcode(s) with degenerate (sub-grid) "
|
||||||
|
f"geometry: {preview}{suffix}"
|
||||||
|
)
|
||||||
|
|
||||||
if units_dir.exists():
|
if units_dir.exists():
|
||||||
shutil.rmtree(units_dir)
|
shutil.rmtree(units_dir)
|
||||||
tmp_units_dir.replace(units_dir)
|
tmp_units_dir.replace(units_dir)
|
||||||
|
|
|
||||||
|
|
@ -85,19 +85,42 @@ def _claim_inspire_parcels(
|
||||||
uprn_pts = shp_points(points)
|
uprn_pts = shp_points(points)
|
||||||
pt_idx, cand_idx = cand_tree.query(uprn_pts, predicate="within")
|
pt_idx, cand_idx = cand_tree.query(uprn_pts, predicate="within")
|
||||||
|
|
||||||
# First priority: parcels that physically contain UPRNs. Majority vote
|
# First priority: parcels that physically contain UPRNs. A parcel holding
|
||||||
# resolves blocks of flats or overlapping parcel data.
|
# UPRNs from a single postcode goes wholly to that postcode. A parcel shared
|
||||||
|
# by several postcodes (a block of flats spanning postcodes, or overlapping
|
||||||
|
# parcel data) is split between them via a sub-Voronoi over their own UPRNs
|
||||||
|
# clipped to the parcel — so EVERY contained postcode keeps part of the
|
||||||
|
# parcel. A bare majority vote would hand the whole parcel to one winner and
|
||||||
|
# leave the losers' UPRNs trapped inside claimed land, dropping them from
|
||||||
|
# both this claim and the `remaining` polygon handed to Voronoi downstream.
|
||||||
cand_postcodes: dict[int, list[str]] = defaultdict(list)
|
cand_postcodes: dict[int, list[str]] = defaultdict(list)
|
||||||
|
cand_point_idx: dict[int, list[int]] = defaultdict(list)
|
||||||
for pi, ci in zip(pt_idx, cand_idx):
|
for pi, ci in zip(pt_idx, cand_idx):
|
||||||
cand_postcodes[ci].append(postcodes[pi])
|
cand_postcodes[ci].append(postcodes[pi])
|
||||||
|
cand_point_idx[ci].append(pi)
|
||||||
|
|
||||||
|
points_f64 = points.astype(np.float64, copy=False)
|
||||||
contained_parts: dict[str, list] = defaultdict(list)
|
contained_parts: dict[str, list] = defaultdict(list)
|
||||||
contained_scores: Counter[str] = Counter()
|
contained_scores: Counter[str] = Counter()
|
||||||
for ci, pc_list in cand_postcodes.items():
|
for ci, pc_list in cand_postcodes.items():
|
||||||
pc_counts = Counter(pc_list)
|
pc_counts = Counter(pc_list)
|
||||||
winner, votes = pc_counts.most_common(1)[0]
|
if len(pc_counts) == 1:
|
||||||
|
winner = next(iter(pc_counts))
|
||||||
contained_parts[winner].append(parcels[ci])
|
contained_parts[winner].append(parcels[ci])
|
||||||
contained_scores[winner] += votes
|
contained_scores[winner] += pc_counts[winner]
|
||||||
|
continue
|
||||||
|
# Shared parcel: sub-Voronoi over the contained UPRNs so each postcode
|
||||||
|
# present keeps a fragment instead of being absorbed by the winner.
|
||||||
|
sub_idx = cand_point_idx[ci]
|
||||||
|
sub_points = points_f64[sub_idx]
|
||||||
|
sub_postcodes = [postcodes[pi] for pi in sub_idx]
|
||||||
|
for pc, geom in compute_voronoi_regions(
|
||||||
|
sub_points, sub_postcodes, parcels[ci]
|
||||||
|
).items():
|
||||||
|
cleaned = _clean_polygonal(geom)
|
||||||
|
if cleaned is not None:
|
||||||
|
contained_parts[pc].append(cleaned)
|
||||||
|
contained_scores[pc] += pc_counts[pc]
|
||||||
|
|
||||||
contained_claimed = _merge_parts_by_postcode(contained_parts)
|
contained_claimed = _merge_parts_by_postcode(contained_parts)
|
||||||
contained_claims = sorted(
|
contained_claims = sorted(
|
||||||
|
|
@ -109,7 +132,6 @@ def _claim_inspire_parcels(
|
||||||
# each to the nearest UPRN/postcode so parcel boundaries carry more of the
|
# each to the nearest UPRN/postcode so parcel boundaries carry more of the
|
||||||
# visible postcode shape; Voronoi is then limited to roads, parks, water, and
|
# visible postcode shape; Voronoi is then limited to roads, parks, water, and
|
||||||
# any other non-parcel gaps.
|
# any other non-parcel gaps.
|
||||||
points_f64 = points.astype(np.float64, copy=False)
|
|
||||||
contained_union = _union_claims(contained_claims)
|
contained_union = _union_claims(contained_claims)
|
||||||
nearest_tree = cKDTree(points_f64)
|
nearest_tree = cKDTree(points_f64)
|
||||||
nearest_parts: dict[str, list] = defaultdict(list)
|
nearest_parts: dict[str, list] = defaultdict(list)
|
||||||
|
|
@ -235,11 +257,11 @@ def _extract_polygonal(geom) -> Polygon | MultiPolygon | None:
|
||||||
return None
|
return None
|
||||||
if len(polys) == 1:
|
if len(polys) == 1:
|
||||||
return polys[0]
|
return polys[0]
|
||||||
return MultiPolygon(
|
# Union (not bare MultiPolygon construction): make_valid can emit
|
||||||
[
|
# overlapping polygonal parts, and a MultiPolygon of overlapping parts is
|
||||||
p
|
# invalid — it double-counts area and makes the next `.difference()` raise
|
||||||
for g in polys
|
# a TopologyException that aborts the OA (and, in parallel mode, the
|
||||||
for p in (g.geoms if g.geom_type == "MultiPolygon" else [g])
|
# worker). unary_union merges them into a valid geometry.
|
||||||
]
|
merged = unary_union(polys)
|
||||||
)
|
return merged if not merged.is_empty else None
|
||||||
return None
|
return None
|
||||||
|
|
|
||||||
|
|
@ -11,12 +11,20 @@ import pytest
|
||||||
from shapely.geometry import MultiPolygon, Polygon, box
|
from shapely.geometry import MultiPolygon, Polygon, box
|
||||||
from shapely.ops import unary_union
|
from shapely.ops import unary_union
|
||||||
|
|
||||||
|
from .fragments_cache import (
|
||||||
|
fragments_cache_is_fresh,
|
||||||
|
load_fragments,
|
||||||
|
save_fragments,
|
||||||
|
)
|
||||||
|
from .__main__ import _oa_fragments, _process_oas
|
||||||
|
from .inspire import build_inspire_index
|
||||||
from .oa_boundaries import parse_gpkg_geometry
|
from .oa_boundaries import parse_gpkg_geometry
|
||||||
from .greenspace import subtract_greenspace
|
from .greenspace import subtract_greenspace
|
||||||
from .output import (
|
from .output import (
|
||||||
_fill_holes,
|
_fill_holes,
|
||||||
merge_fragments,
|
merge_fragments,
|
||||||
to_wgs84_geojson,
|
to_wgs84_geojson,
|
||||||
|
to_wgs84_geojson_multi,
|
||||||
write_district_geojson,
|
write_district_geojson,
|
||||||
)
|
)
|
||||||
from .process_oa import _extract_polygonal, process_oa
|
from .process_oa import _extract_polygonal, process_oa
|
||||||
|
|
@ -173,6 +181,52 @@ class TestWhitespacePostcodes:
|
||||||
|
|
||||||
assert loaded_df["PCDS"].to_list() == ["AA1 1AB"]
|
assert loaded_df["PCDS"].to_list() == ["AA1 1AB"]
|
||||||
|
|
||||||
|
def test_remapped_terminated_postcode_adopts_successor_oa(self, tmp_path):
    """A terminated postcode remapped to its active successor must be seeded
    with the SUCCESSOR's OA and coordinates, not the terminated postcode's own
    OA. Keeping the old OA21CD would seed the successor into an OA it does not
    belong to and split its boundary across OAs."""
    terminated_oa = "E00000001"
    successor_oa = "E00000002"

    # The only UPRN belongs to terminated AA1 1AA, located in terminated_oa.
    uprn_path = tmp_path / "uprn.parquet"
    pl.DataFrame(
        {
            "GRIDGB1E": [500010],
            "GRIDGB1N": [180010],
            "PCDS": ["AA1 1AA"],
            "OA21CD": [terminated_oa],
        }
    ).write_parquet(uprn_path)

    # AA1 1AA is terminated, so AA1 1AB is the only active successor, and it
    # sits in a different OA than the terminated postcode.
    arcgis_path = tmp_path / "arcgis.parquet"
    pl.DataFrame(
        {
            "pcds": ["AA1 1AA", "AA1 1AB"],
            "east1m": [500010, 500030],
            "north1m": [180010, 180020],
            "oa21cd": [terminated_oa, successor_oa],
            "doterm": ["2020-01-01", None],
            "ctry25cd": ["E92000001", "E92000001"],
        }
    ).write_parquet(arcgis_path)

    loaded_df, offsets = load_uprns(uprn_path, arcgis_path)

    # The remapped point is grouped under the successor's OA only.
    assert "E00000002" in offsets, "Successor OA missing — remap kept old OA"
    assert "E00000001" not in offsets, (
        "Remapped point still lives in the terminated postcode's OA"
    )

    points, postcodes = get_oa_uprns(loaded_df, offsets, "E00000002")
    assert postcodes == ["AA1 1AB"]
    # The successor's authoritative coordinates replace the UPRN's own.
    assert points.tolist() == [[500030.0, 180020.0]]
|
||||||
|
|
||||||
def test_arcgis_filters_to_active_english_postcodes(self, tmp_path):
|
def test_arcgis_filters_to_active_english_postcodes(self, tmp_path):
|
||||||
uprns = pl.DataFrame(
|
uprns = pl.DataFrame(
|
||||||
{
|
{
|
||||||
|
|
@ -617,6 +671,32 @@ class TestProcessOAInspireParcelAssignment:
|
||||||
for _, geom in fragments:
|
for _, geom in fragments:
|
||||||
assert geom.difference(oa_geom).area < 0.01
|
assert geom.difference(oa_geom).area < 0.01
|
||||||
|
|
||||||
|
def test_shared_parcel_keeps_every_contained_postcode(self):
    """One parcel holding UPRNs for [A, A, B] must produce a fragment for both
    A and B. With a bare majority vote, A would claim the whole parcel and B
    would be left without any fragment."""
    whole_oa = box(0, 0, 100, 100)
    shared_parcel = box(0, 0, 100, 100)  # a single parcel spanning the OA
    uprn_postcodes = ["A", "A", "B"]
    uprn_points = np.array(
        [
            [20, 50],  # A
            [30, 50],  # A, giving A the majority
            [80, 50],  # B, the minority
        ]
    )

    by_postcode = dict(
        process_oa(
            whole_oa,
            uprn_points,
            uprn_postcodes,
            inspire_candidates=[shared_parcel],
        )
    )

    assert "A" in by_postcode, "Majority postcode A must keep a fragment"
    assert "B" in by_postcode, "Minority postcode B must not be dropped"
    frag_a = by_postcode["A"]
    frag_b = by_postcode["B"]
    assert frag_a.area > 0
    assert frag_b.area > 0
    # The two fragments may share an edge but must not overlap.
    assert frag_a.intersection(frag_b).area < 0.01
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# _extract_polygonal helper
|
# _extract_polygonal helper
|
||||||
|
|
@ -656,6 +736,21 @@ class TestExtractPolygonal:
|
||||||
|
|
||||||
assert _extract_polygonal(LineString([(0, 0), (1, 1)])) is None
|
assert _extract_polygonal(LineString([(0, 0), (1, 1)])) is None
|
||||||
|
|
||||||
|
def test_overlapping_collection_unioned_to_valid(self):
    """Overlapping polygons inside a GeometryCollection must come back as one
    VALID geometry whose area counts the overlap once, and which can be fed to
    a subsequent .difference() call."""
    from shapely.geometry import GeometryCollection

    first = box(0, 0, 100, 100)
    second = box(50, 50, 150, 150)  # shares a 50x50 corner with `first`
    expected_area = unary_union([first, second]).area

    result = _extract_polygonal(GeometryCollection([first, second]))

    assert result is not None
    assert result.is_valid
    assert result.area == pytest.approx(expected_area)
    # A follow-up difference on the result must itself be valid.
    remainder = result.difference(box(0, 0, 10, 10))
    assert remainder.is_valid
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Edge case: merge_fragments handles single-OA postcodes
|
# Edge case: merge_fragments handles single-OA postcodes
|
||||||
|
|
@ -763,12 +858,12 @@ class TestParseGpkgGeometry:
|
||||||
|
|
||||||
|
|
||||||
class TestFillHoles:
|
class TestFillHoles:
|
||||||
"""_fill_holes must remove all interior holes from polygons."""
|
"""_fill_holes fills small artifact holes but keeps large (real-enclosed) ones."""
|
||||||
|
|
||||||
def test_polygon_with_hole(self):
|
def test_small_artifact_hole_filled(self):
|
||||||
"""A polygon with an interior ring should become a solid polygon."""
|
"""A small (<1000 m²) interior ring is an artifact and gets filled."""
|
||||||
outer = [(0, 0), (100, 0), (100, 100), (0, 100), (0, 0)]
|
outer = [(0, 0), (100, 0), (100, 100), (0, 100), (0, 0)]
|
||||||
hole = [(30, 30), (70, 30), (70, 70), (30, 70), (30, 30)]
|
hole = [(40, 40), (60, 40), (60, 60), (40, 60), (40, 40)] # 20x20 = 400 m²
|
||||||
poly_with_hole = Polygon(outer, [hole])
|
poly_with_hole = Polygon(outer, [hole])
|
||||||
assert len(list(poly_with_hole.interiors)) == 1
|
assert len(list(poly_with_hole.interiors)) == 1
|
||||||
result = _fill_holes(poly_with_hole)
|
result = _fill_holes(poly_with_hole)
|
||||||
|
|
@ -776,6 +871,15 @@ class TestFillHoles:
|
||||||
assert len(list(result.interiors)) == 0
|
assert len(list(result.interiors)) == 0
|
||||||
assert result.area == pytest.approx(Polygon(outer).area)
|
assert result.area == pytest.approx(Polygon(outer).area)
|
||||||
|
|
||||||
|
def test_large_hole_kept(self):
    """A hole of at least 1000 m² is treated as real and must survive."""
    shell = [(0, 0), (100, 0), (100, 100), (0, 100), (0, 0)]
    # 60 x 60 = 3600 m², well above the artifact threshold.
    big_hole = [(20, 20), (80, 20), (80, 80), (20, 80), (20, 20)]

    result = _fill_holes(Polygon(shell, [big_hole]))

    remaining_holes = list(result.interiors)
    assert len(remaining_holes) == 1
    assert result.area == pytest.approx(10000 - 3600)
|
||||||
|
|
||||||
def test_multipolygon_with_holes(self):
|
def test_multipolygon_with_holes(self):
|
||||||
"""A MultiPolygon where each part has holes should have all holes removed."""
|
"""A MultiPolygon where each part has holes should have all holes removed."""
|
||||||
outer1 = [(0, 0), (50, 0), (50, 50), (0, 50), (0, 0)]
|
outer1 = [(0, 0), (50, 0), (50, 50), (0, 50), (0, 0)]
|
||||||
|
|
@ -944,3 +1048,356 @@ class TestGreenspaceHolePreserved:
|
||||||
merged = result["TEST1"]
|
merged = result["TEST1"]
|
||||||
assert len(list(merged.interiors)) == 1
|
assert len(list(merged.interiors)) == 1
|
||||||
assert merged.area == pytest.approx(10000 - 1600, rel=0.05)
|
assert merged.area == pytest.approx(10000 - 1600, rel=0.05)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# merge_fragments keeps substantial detached parts (no OA-seam coverage gaps)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestKeepDetachedParts:
    """merge_fragments must keep every substantial part of a postcode that is
    split in two (e.g. by a railway or river) rather than only the largest,
    while still discarding tiny detached noise."""

    def test_far_apart_parts_both_kept(self):
        # Two 2500 m² blocks separated by a 30 m gap, too wide for the
        # buffer(5).buffer(-5) closing step to bridge.
        west = box(0, 0, 50, 50)
        east = box(80, 0, 130, 50)
        fragments = [("AA1 1AA", west), ("AA1 1AA", east)]

        geom = merge_fragments(fragments)["AA1 1AA"]

        assert geom.geom_type == "MultiPolygon"
        assert len(geom.geoms) == 2
        assert geom.area == pytest.approx(5000, rel=0.01)

    def test_tiny_noise_part_dropped(self):
        # 10000 m² main body plus a 25 m² speck, below the 100 m² threshold.
        body = box(0, 0, 100, 100)
        speck = box(200, 200, 205, 205)
        fragments = [("AA1 1AA", body), ("AA1 1AA", speck)]

        geom = merge_fragments(fragments)["AA1 1AA"]

        assert geom.geom_type == "Polygon"
        assert geom.area == pytest.approx(10000, rel=0.01)
|
||||||
|
|
||||||
|
|
||||||
|
class TestMultiPolygonOutput:
    """Split postcodes must be emitted as GeoJSON MultiPolygon, both by
    to_wgs84_geojson_multi and by the district writer; single-part postcodes
    stay plain Polygon."""

    def test_multipolygon_preserves_all_parts(self):
        from shapely.geometry import shape

        west = box(530000, 180000, 530100, 180100)
        east = box(531000, 180000, 531100, 180100)

        gj = to_wgs84_geojson_multi(MultiPolygon([west, east]))

        assert gj["type"] == "MultiPolygon"
        assert len(gj["coordinates"]) == 2
        # Round-trip through shapely to check the dict is a usable geometry.
        round_tripped = shape(gj)
        assert round_tripped.is_valid and not round_tripped.is_empty
        assert len(round_tripped.geoms) == 2

    def test_single_part_stays_polygon(self):
        single = box(530000, 180000, 530100, 180100)
        gj = to_wgs84_geojson_multi(single)
        assert gj["type"] == "Polygon"

    def test_writer_emits_multipolygon_feature(self, tmp_path):
        west = box(530000, 180000, 530100, 180100)
        east = box(531000, 180000, 531100, 180100)
        split_postcode = MultiPolygon([west, east])

        files_written = write_district_geojson({"AA1 1AA": split_postcode}, tmp_path)
        assert files_written == 1

        district_file = tmp_path / "units" / "AA1.geojson"
        coll = json.loads(district_file.read_text())
        first_feature = coll["features"][0]
        assert first_feature["geometry"]["type"] == "MultiPolygon"
|
||||||
|
|
||||||
|
|
||||||
|
class TestOutputPartition:
    """The writer must emit a partition: overlapping postcodes are made disjoint
    (no two cover the same ground) without dropping an active postcode."""

    @staticmethod
    def _written_geoms(out_dir):
        """Read back ``units/AA1.geojson`` under `out_dir` as {postcode: geometry}."""
        from shapely.geometry import shape

        coll = json.loads((out_dir / "units" / "AA1.geojson").read_text())
        return {
            f["properties"]["postcodes"]: shape(f["geometry"])
            for f in coll["features"]
        }

    def test_overlapping_postcodes_made_disjoint(self, tmp_path):
        """Two partially overlapping postcodes are both kept, with no shared area."""
        a = box(530000, 180000, 530100, 180100)
        b = box(530090, 180000, 530200, 180100)  # overlaps `a` in a 10m strip
        assert a.intersection(b).area > 0  # precondition: they overlap

        write_district_geojson({"AA1 1AA": a, "AA1 1AB": b}, tmp_path)
        geoms = self._written_geoms(tmp_path)
        assert set(geoms) == {"AA1 1AA", "AA1 1AB"}  # neither dropped
        # Disjoint interiors (share at most an edge).
        assert geoms["AA1 1AA"].intersection(geoms["AA1 1AB"]).area == pytest.approx(
            0.0, abs=1e-12
        )
        assert all(g.area > 0 for g in geoms.values())

    def test_enclosed_postcode_makes_container_a_donut(self, tmp_path):
        """A postcode fully INSIDE another must stay disjoint: the smaller (inner)
        keeps its area, the container gets a hole. A plain `overlaps` query misses
        containment, so this is the regression guard for that fix."""
        outer = box(530000, 180000, 530300, 180300)  # 90,000 m²
        inner = box(530100, 180100, 530200, 180200)  # 10,000 m², fully inside outer
        assert outer.contains(inner)  # precondition

        write_district_geojson({"AA1 1AA": outer, "AA1 1AB": inner}, tmp_path)
        geoms = self._written_geoms(tmp_path)
        assert set(geoms) == {"AA1 1AA", "AA1 1AB"}  # neither dropped
        assert geoms["AA1 1AA"].intersection(geoms["AA1 1AB"]).area == pytest.approx(
            0.0, abs=1e-12
        )
        # Container is now a donut around the enclosed postcode.
        assert geoms["AA1 1AA"].geom_type == "Polygon"
        assert len(list(geoms["AA1 1AA"].interiors)) == 1
        assert geoms["AA1 1AB"].area > 0
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# InspireIndex must return the same candidates as a brute-force bbox scan
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestInspireIndex:
    """The grid index replaces a per-OA linear scan of all parcel bboxes; it must
    return an identical candidate set (and order) so Phase 3 output is unchanged."""

    @staticmethod
    def _brute(bboxes, query):
        """Reference scan: indices of every row of `bboxes` overlapping `query`.

        Both are laid out as (min_e, min_n, max_e, max_n); touching edges count
        as overlapping (inclusive comparisons).
        """
        e0, n0, e1, n1 = query
        mask = (
            (bboxes[:, 2] >= e0)
            & (bboxes[:, 0] <= e1)
            & (bboxes[:, 3] >= n0)
            & (bboxes[:, 1] <= n1)
        )
        return np.where(mask)[0]

    def test_matches_brute_force_over_random_queries(self):
        """Index candidates equal the brute-force scan for 400 random queries."""
        rng = np.random.default_rng(0)
        x = rng.uniform(0, 10000, 5000)
        y = rng.uniform(0, 10000, 5000)
        w = rng.uniform(1, 60, 5000)  # all <= 500m cell → CSR path
        h = rng.uniform(1, 60, 5000)
        bboxes = np.column_stack([x, y, x + w, y + h]).astype(np.float64)
        idx = build_inspire_index(bboxes, None, None, cell_size=500.0)

        for _ in range(400):
            cx, cy = rng.uniform(0, 10000), rng.uniform(0, 10000)
            sz = float(rng.choice([30.0, 200.0, 1000.0, 3000.0]))
            # Named `query`, not `box`, to avoid shadowing the shapely `box`
            # helper used throughout this module.
            query = (cx, cy, cx + sz, cy + sz)
            got = idx.candidate_indices(query)
            expected = np.sort(self._brute(bboxes, query))
            assert np.array_equal(got, expected)

    def test_oversized_parcel_is_found(self):
        # A parcel larger than a cell goes to the overflow list, not the grid;
        # a query deep inside it (away from the small parcels) must still find it.
        bboxes = np.array(
            [
                [0.0, 0.0, 5000.0, 5000.0],  # 5km parcel >> 500m cell
                [100.0, 100.0, 120.0, 120.0],
                [4000.0, 4000.0, 4020.0, 4020.0],
            ]
        )
        idx = build_inspire_index(bboxes, None, None, cell_size=500.0)
        query = (2000.0, 2000.0, 2050.0, 2050.0)
        got = idx.candidate_indices(query)
        assert 0 in got
        assert np.array_equal(got, np.sort(self._brute(bboxes, query)))

    def test_no_overlap_returns_empty(self):
        """A query far from every parcel yields no candidates."""
        bboxes = np.array([[0.0, 0.0, 10.0, 10.0], [20.0, 20.0, 30.0, 30.0]])
        idx = build_inspire_index(bboxes, None, None, cell_size=500.0)
        assert len(idx.candidate_indices((100.0, 100.0, 110.0, 110.0))) == 0
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Parallel OA processing must match the sequential result exactly
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestParallelProcessing:
    """_process_oas across workers must produce the same fragments as workers=1.
    Uses single-postcode OAs (fast path), so it exercises the chunking + WKB
    round-trip + fork machinery without needing INSPIRE data."""

    @staticmethod
    def _inputs(n_oas=60):
        """Build synthetic inputs for `_process_oas`: `n_oas` square OAs, each
        holding two UPRN points that share one postcode.

        Returns ``(codes, oa_geoms, east, north, postcodes, offsets)`` where
        `offsets[code]` is the half-open row range of that OA's points.
        """
        import pyarrow as pa

        oa_geoms = {
            f"E{i:08d}": box(i * 100.0, 0.0, i * 100.0 + 50.0, 50.0)
            for i in range(n_oas)
        }
        codes = sorted(oa_geoms)
        east, north, pcs = [], [], []
        offsets = {}
        pos = 0
        for i, code in enumerate(codes):
            east += [i * 100.0 + 10.0, i * 100.0 + 20.0]
            north += [10.0, 20.0]
            pcs += [f"AA{i % 5} {i % 9}AA"] * 2  # one postcode per OA → fast path
            offsets[code] = (pos, pos + 2)
            pos += 2
        return (
            codes,
            oa_geoms,
            np.array(east),
            np.array(north),
            pa.array(pcs, type=pa.large_string()),
            offsets,
        )

    @staticmethod
    def _norm(frags):
        """Order-independent, comparable form of a fragment list."""
        return sorted((pc, geom.wkb_hex) for pc, geom in frags)

    def test_parallel_matches_sequential(self):
        """workers=3 yields the same fragments and count as workers=1."""
        codes, oa, east, north, pcs, offs = self._inputs()
        seq, s1 = _process_oas(codes, oa, east, north, pcs, offs, None, workers=1)
        par, s2 = _process_oas(codes, oa, east, north, pcs, offs, None, workers=3)
        assert len(seq) == len(codes)  # one fragment per single-postcode OA
        assert s1 == s2 == len(codes)
        assert self._norm(seq) == self._norm(par)

    def test_oa_failure_is_tagged_with_oa_code(self):
        """A failure inside per-OA processing must re-raise with the OA code, so a
        single bad OA is attributable instead of an anonymous worker abort."""
        # Missing OA in the geoms dict → KeyError, wrapped with the OA code.
        with pytest.raises(RuntimeError, match="E00099999"):
            _oa_fragments("E00099999", {}, None, None, None, {}, None)
|
||||||
|
|
||||||
|
|
||||||
|
class TestDegenerateGeometryHandling:
    """Every active postcode must keep a boundary (validate_outputs is strict),
    so a sub-grid sliver is fattened rather than dropped. A genuinely empty
    geometry is skipped without aborting the whole write (the 10h regression)."""

    # Three near-collinear vertices in BNG: bbox ~28m x 7m but area ~0.04 m²,
    # i.e. AL10 0TU. Without the rescue it snaps to empty at output precision.
    SLIVER = Polygon(
        [(523045.34, 209625.56), (523040.47, 209624.33), (523017.0, 209618.42)]
    )

    def test_sliver_is_rescued_to_valid_geometry(self):
        """The sliver converts to a non-empty, valid GeoJSON geometry."""
        from shapely.geometry import shape

        result = to_wgs84_geojson(self.SLIVER)
        assert result is not None, "sliver must be rescued, not dropped"
        rt = shape(result)
        assert not rt.is_empty
        assert rt.is_valid

    def test_collinear_zero_area_input_is_rescued(self):
        """A zero-area collinear 'polygon' (can't be cleaned to a polygon) must
        still be rescued via the representative-point fallback, not dropped."""
        from shapely.geometry import shape

        degenerate = Polygon(
            [(523000, 209600), (523010, 209600), (523020, 209600), (523000, 209600)]
        )
        assert degenerate.area == 0.0
        result = to_wgs84_geojson(degenerate)
        assert result is not None, "degenerate input must be rescued, not dropped"
        rt = shape(result)
        assert not rt.is_empty
        assert rt.is_valid

    def test_sliver_postcode_present_in_output(self, tmp_path):
        """The writer keeps the sliver postcode alongside a normal one."""
        postcodes = {
            "AA1 1AA": box(530000, 180000, 530100, 180100),
            "AA1 1AB": self.SLIVER,  # must survive
        }
        file_count = write_district_geojson(postcodes, tmp_path)
        assert file_count == 1
        collection = json.loads((tmp_path / "units" / "AA1.geojson").read_text())
        written = {f["properties"]["postcodes"] for f in collection["features"]}
        assert written == {"AA1 1AA", "AA1 1AB"}

    def test_empty_geometry_skipped_not_raised(self, tmp_path):
        # The last-resort safety net: an unrescuable (empty) geometry is skipped
        # so one bad postcode can never abort a multi-hour run.
        postcodes = {
            "AA1 1AA": box(530000, 180000, 530100, 180100),
            "AA1 1AB": Polygon(),  # genuinely empty
        }
        file_count = write_district_geojson(postcodes, tmp_path)
        assert file_count == 1
        collection = json.loads((tmp_path / "units" / "AA1.geojson").read_text())
        written = {f["properties"]["postcodes"] for f in collection["features"]}
        assert written == {"AA1 1AA"}
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# fragments_cache round-trips Phase 3 output and validates freshness
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestFragmentsCache:
    """Persisting Phase 3 lets a crashed run resume without the ~10h OA loop."""

    def test_round_trip_preserves_postcodes_and_geometry(self, tmp_path):
        """save → load returns the same postcodes, in order, with equal geometry."""
        fragments = [
            ("AA1 1AA", box(0, 0, 100, 100)),
            ("AA1 1AB", box(200, 200, 250, 260)),
            # A postcode spanning multiple OAs appears as repeated entries.
            ("AA1 1AA", box(100, 0, 150, 100)),
            ("AA1 1AC", MultiPolygon([box(0, 0, 10, 10), box(20, 20, 30, 30)])),
        ]
        cache = tmp_path / "fragments_cache.parquet"
        save_fragments(cache, fragments)
        loaded = load_fragments(cache)

        # Equal postcode lists also guarantee equal length, so the zip below
        # cannot silently skip a trailing fragment.
        assert [pc for pc, _ in loaded] == [pc for pc, _ in fragments]
        for (_, original), (_, restored) in zip(fragments, loaded):
            assert restored.equals(original)

    def test_save_is_atomic_no_tmp_left_behind(self, tmp_path):
        """After a save the cache exists and no `.tmp` sibling remains."""
        cache = tmp_path / "fragments_cache.parquet"
        save_fragments(cache, [("AA1 1AA", box(0, 0, 1, 1))])
        assert cache.exists()
        assert not (tmp_path / "fragments_cache.parquet.tmp").exists()

    def test_missing_cache_is_not_fresh(self, tmp_path):
        """A cache file that does not exist is never fresh."""
        cache = tmp_path / "fragments_cache.parquet"
        inp = tmp_path / "uprn.parquet"
        inp.write_text("x")
        assert fragments_cache_is_fresh(cache, [inp]) is False

    def test_cache_newer_than_inputs_is_fresh(self, tmp_path):
        """A cache modified after its inputs is fresh; `None` inputs are accepted."""
        import os

        inp = tmp_path / "uprn.parquet"
        inp.write_text("x")
        cache = tmp_path / "fragments_cache.parquet"
        cache.write_text("c")
        # Pin mtimes explicitly so the result does not depend on write timing.
        os.utime(inp, (1_000, 1_000))
        os.utime(cache, (2_000, 2_000))
        assert fragments_cache_is_fresh(cache, [inp, None]) is True

    def test_cache_older_than_any_input_is_stale(self, tmp_path):
        """An input modified after the cache makes the cache stale."""
        import os

        inp = tmp_path / "oa.gpkg"
        inp.write_text("x")
        cache = tmp_path / "fragments_cache.parquet"
        cache.write_text("c")
        os.utime(cache, (1_000, 1_000))
        os.utime(inp, (2_000, 2_000))  # input touched after the cache
        assert fragments_cache_is_fresh(cache, [inp]) is False

    def test_missing_input_is_ignored(self, tmp_path):
        """An input path that does not exist cannot invalidate the cache."""
        cache = tmp_path / "fragments_cache.parquet"
        cache.write_text("c")
        # arcgis is optional/absent — it cannot have invalidated the cache.
        assert fragments_cache_is_fresh(cache, [tmp_path / "absent.parquet"]) is True
|
||||||
|
|
|
||||||
|
|
@ -79,13 +79,42 @@ def load_uprns(
|
||||||
)
|
)
|
||||||
|
|
||||||
if mapping is not None and mapping.height > 0:
|
if mapping is not None and mapping.height > 0:
|
||||||
uprns = (
|
# Remap terminated postcodes to their nearest active successor. The
|
||||||
uprns.join(
|
# successor generally lives in a DIFFERENT OA (and at different grid
|
||||||
|
# coordinates), so the remapped point must adopt the successor's
|
||||||
|
# authoritative OA/coords — keeping the terminated postcode's original
|
||||||
|
# OA would seed the successor into an OA it doesn't belong to, splitting
|
||||||
|
# its boundary across OAs. Genuine (non-remapped) UPRN rows keep their
|
||||||
|
# own OA, since a live postcode can legitimately span several OAs.
|
||||||
|
uprns = uprns.join(
|
||||||
mapping.lazy(), left_on="PCDS", right_on="old_postcode", how="left"
|
mapping.lazy(), left_on="PCDS", right_on="old_postcode", how="left"
|
||||||
|
).with_columns(pl.col("new_postcode").is_not_null().alias("_remapped"))
|
||||||
|
if active_postcode_points is not None:
|
||||||
|
successor_oa = active_postcode_points.rename(
|
||||||
|
{
|
||||||
|
"PCDS": "new_postcode",
|
||||||
|
"GRIDGB1E": "_succ_e",
|
||||||
|
"GRIDGB1N": "_succ_n",
|
||||||
|
"OA21CD": "_succ_oa",
|
||||||
|
}
|
||||||
)
|
)
|
||||||
.with_columns(pl.coalesce("new_postcode", "PCDS").alias("PCDS"))
|
uprns = uprns.join(successor_oa, on="new_postcode", how="left").with_columns(
|
||||||
.select("GRIDGB1E", "GRIDGB1N", "PCDS", "OA21CD")
|
pl.when("_remapped")
|
||||||
|
.then(pl.col("_succ_e"))
|
||||||
|
.otherwise(pl.col("GRIDGB1E"))
|
||||||
|
.alias("GRIDGB1E"),
|
||||||
|
pl.when("_remapped")
|
||||||
|
.then(pl.col("_succ_n"))
|
||||||
|
.otherwise(pl.col("GRIDGB1N"))
|
||||||
|
.alias("GRIDGB1N"),
|
||||||
|
pl.when("_remapped")
|
||||||
|
.then(pl.col("_succ_oa"))
|
||||||
|
.otherwise(pl.col("OA21CD"))
|
||||||
|
.alias("OA21CD"),
|
||||||
)
|
)
|
||||||
|
uprns = uprns.with_columns(
|
||||||
|
pl.coalesce("new_postcode", "PCDS").alias("PCDS")
|
||||||
|
).select("GRIDGB1E", "GRIDGB1N", "PCDS", "OA21CD")
|
||||||
|
|
||||||
if active_postcode_points is not None:
|
if active_postcode_points is not None:
|
||||||
active_postcodes = active_postcode_points.select("PCDS").unique()
|
active_postcodes = active_postcode_points.select("PCDS").unique()
|
||||||
|
|
@ -149,3 +178,37 @@ def get_oa_uprns(
|
||||||
)
|
)
|
||||||
postcodes = sub["PCDS"].to_list()
|
postcodes = sub["PCDS"].to_list()
|
||||||
return points, postcodes
|
return points, postcodes
|
||||||
|
|
||||||
|
|
||||||
|
def extract_uprn_arrays(df: pl.DataFrame):
    """Convert the UPRN DataFrame to fork-shareable numpy/Arrow arrays.

    Returns ``(east, north, postcodes)``: two float64 ndarrays and a contiguous
    pyarrow string Array. Multiprocessing workers slice these per OA via
    :func:`get_oa_uprns_arrays` **without touching polars**, which avoids the
    fork-after-threads deadlock hazard of polars' rayon pool. Being plain
    numpy/Arrow buffers (not millions of Python objects), they are shared by
    ``fork`` copy-on-write rather than duplicated ~1GB per worker.

    Args:
        df: UPRN frame with ``GRIDGB1E``, ``GRIDGB1N`` and ``PCDS`` columns.
    """
    import pyarrow as pa

    east = np.ascontiguousarray(df["GRIDGB1E"].to_numpy(), dtype=np.float64)
    north = np.ascontiguousarray(df["GRIDGB1N"].to_numpy(), dtype=np.float64)
    postcodes = df["PCDS"].to_arrow()
    if isinstance(postcodes, pa.ChunkedArray):
        # The contract above promises a single contiguous Array, so collapse
        # a multi-chunk column before handing it out.
        postcodes = postcodes.combine_chunks()
    return east, north, postcodes
|
||||||
|
|
||||||
|
|
||||||
|
def get_oa_uprns_arrays(
    east: np.ndarray,
    north: np.ndarray,
    postcodes,
    offsets: dict[str, tuple[int, int]],
    oa_code: str,
) -> tuple[np.ndarray, list[str]]:
    """Like :func:`get_oa_uprns`, but slices the fork-shareable arrays from
    :func:`extract_uprn_arrays` (no polars), so it is safe to call in workers.

    Args:
        east: Easting of every UPRN row.
        north: Northing of every UPRN row, aligned with `east`.
        postcodes: Array-like exposing ``slice(offset, length)`` and
            ``to_pylist()``, aligned with `east`/`north`.
        offsets: Maps an OA code to its half-open ``(start, end)`` row range.
        oa_code: The OA whose rows to extract.

    Returns:
        ``(points, postcodes)``: an ``(n, 2)`` array of (east, north) pairs and
        the matching postcode list for the OA's ``n`` rows.

    Raises:
        KeyError: if `oa_code` is not present in `offsets`.
    """
    s, e = offsets[oa_code]
    points = np.column_stack([east[s:e], north[s:e]])
    # `slice` takes (offset, length), not (start, stop).
    return points, postcodes.slice(s, e - s).to_pylist()
|
||||||
|
|
|
||||||
|
|
@ -11,9 +11,9 @@ from pathlib import Path
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import polars as pl
|
import polars as pl
|
||||||
|
|
||||||
|
from pipeline.transform.price_estimation.estimate import guarded_blend_estimates
|
||||||
from pipeline.transform.price_estimation.index import build_index
|
from pipeline.transform.price_estimation.index import build_index
|
||||||
from pipeline.transform.price_estimation.knn import (
|
from pipeline.transform.price_estimation.knn import (
|
||||||
KNN_BLEND_WEIGHT,
|
|
||||||
build_knn_pool,
|
build_knn_pool,
|
||||||
knn_median_psm,
|
knn_median_psm,
|
||||||
)
|
)
|
||||||
|
|
@ -115,7 +115,10 @@ def predict(test: pl.DataFrame, index: pl.DataFrame) -> pl.DataFrame:
|
||||||
.clip(-MAX_LOG_ADJUSTMENT, MAX_LOG_ADJUSTMENT)
|
.clip(-MAX_LOG_ADJUSTMENT, MAX_LOG_ADJUSTMENT)
|
||||||
.exp()
|
.exp()
|
||||||
)
|
)
|
||||||
.fill_null(pl.col("input_price").cast(pl.Float64))
|
# Keep null when the index can't be interpolated, matching production
|
||||||
|
# (estimate.py ships null there). compute_metrics filters to finite
|
||||||
|
# positive predictions, so these rows correctly drop from the Index n
|
||||||
|
# rather than silently degrading to the Naive prediction.
|
||||||
.alias("predicted"),
|
.alias("predicted"),
|
||||||
)
|
)
|
||||||
return test
|
return test
|
||||||
|
|
@ -265,13 +268,12 @@ def main():
|
||||||
f" kNN estimates: {n_knn:,} of {len(test):,} ({n_knn / len(test) * 100:.1f}%)"
|
f" kNN estimates: {n_knn:,} of {len(test):,} ({n_knn / len(test) * 100:.1f}%)"
|
||||||
)
|
)
|
||||||
|
|
||||||
# Blend: (1-w)*index + w*kNN where both available
|
# Blend with the exact shipped estimator (stability gate + last-price cap +
|
||||||
|
# null-when-no-index) so the "Blended" stage reflects production accuracy.
|
||||||
|
# input_price is the backtest equivalent of production's "Last known price".
|
||||||
index_est = test["predicted"].to_numpy().astype(np.float64)
|
index_est = test["predicted"].to_numpy().astype(np.float64)
|
||||||
knn_valid = np.isfinite(knn_est) & (knn_est > 0)
|
blended = guarded_blend_estimates(
|
||||||
blended = np.where(
|
index_est, knn_est, test["input_price"].cast(pl.Float64).to_numpy()
|
||||||
knn_valid & np.isfinite(index_est),
|
|
||||||
(1 - KNN_BLEND_WEIGHT) * index_est + KNN_BLEND_WEIGHT * knn_est,
|
|
||||||
np.where(np.isfinite(index_est), index_est, knn_est),
|
|
||||||
)
|
)
|
||||||
|
|
||||||
actual = test["actual_price"].to_numpy().astype(np.float64)
|
actual = test["actual_price"].to_numpy().astype(np.float64)
|
||||||
|
|
|
||||||
|
|
@ -19,6 +19,8 @@ from tqdm import tqdm
|
||||||
from pipeline.transform.price_estimation.shrinkage import (
|
from pipeline.transform.price_estimation.shrinkage import (
|
||||||
blend_dicts,
|
blend_dicts,
|
||||||
hierarchical_shrinkage,
|
hierarchical_shrinkage,
|
||||||
|
reanchor_dict,
|
||||||
|
reanchor_dicts,
|
||||||
shrink_dicts,
|
shrink_dicts,
|
||||||
spatial_smooth,
|
spatial_smooth,
|
||||||
)
|
)
|
||||||
|
|
@ -431,6 +433,17 @@ def build_index(
|
||||||
f" {len(area_idx)} areas, {len(district_idx)} districts, {len(sector_idx)} sectors"
|
f" {len(area_idx)} areas, {len(district_idx)} districts, {len(sector_idx)} sectors"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Re-anchor every repeat-sales dict to the global base year before any
|
||||||
|
# shrinkage/smoothing/blending. solve_robust_index anchors each cell to
|
||||||
|
# log-index 0 at its OWN earliest year, so cells with shorter histories
|
||||||
|
# are measured from a later origin; combining them key-by-key would
|
||||||
|
# otherwise average level-incompatible numbers. The hedonic fallback is
|
||||||
|
# already anchored at min_year, so we align everything to min_year.
|
||||||
|
national_idx = reanchor_dict(national_idx, min_year)
|
||||||
|
area_idx = reanchor_dicts(area_idx, min_year)
|
||||||
|
district_idx = reanchor_dicts(district_idx, min_year)
|
||||||
|
sector_idx = reanchor_dicts(sector_idx, min_year)
|
||||||
|
|
||||||
# Shrinkage: national -> hedonic first, then hierarchical
|
# Shrinkage: national -> hedonic first, then hierarchical
|
||||||
print(" Applying shrinkage...")
|
print(" Applying shrinkage...")
|
||||||
national_shrunk = shrink_dicts(national_idx, hedonic_idx, national_n)
|
national_shrunk = shrink_dicts(national_idx, hedonic_idx, national_n)
|
||||||
|
|
|
||||||
|
|
@ -13,6 +13,51 @@ SPATIAL_NEIGHBORS = 5
|
||||||
SPATIAL_BLEND_K = 30
|
SPATIAL_BLEND_K = 30
|
||||||
|
|
||||||
|
|
||||||
|
def _base_value(index: dict[int, float], base_year: int) -> float:
|
||||||
|
"""Value of an index dict at `base_year`, with forward/back-fill for gaps.
|
||||||
|
|
||||||
|
Each repeat-sales dict is anchored to 0 at its OWN earliest year, so its
|
||||||
|
values are log-levels relative to that origin. To express it on a common
|
||||||
|
origin we need its value at the shared `base_year`:
|
||||||
|
- exact hit: use it directly;
|
||||||
|
- base_year before the dict's history: back-fill, i.e. the earliest known
|
||||||
|
value (which is 0.0 by construction). We cannot observe the level move
|
||||||
|
between the global base and a later-starting cell, so we assume none,
|
||||||
|
matching forward_fill's back-fill convention;
|
||||||
|
- base_year inside a gap / after history: forward-fill the most recent
|
||||||
|
prior value.
|
||||||
|
"""
|
||||||
|
if base_year in index:
|
||||||
|
return index[base_year]
|
||||||
|
years = sorted(index)
|
||||||
|
if not years or base_year < years[0]:
|
||||||
|
return index[years[0]] if years else 0.0
|
||||||
|
prior = [y for y in years if y <= base_year]
|
||||||
|
return index[prior[-1]]
|
||||||
|
|
||||||
|
|
||||||
|
def reanchor_dict(index: dict[int, float], base_year: int) -> dict[int, float]:
    """Re-anchor an index dict so index[base_year] == 0 (pure constant shift).

    Subtracting the same constant from every year preserves all within-dict
    year-to-year differences, so estimate.py's (current - sale) semantics are
    unchanged; it only fixes the cross-dict level mismatch before blending.

    Always returns a NEW dict, including when `index` is empty or already
    anchored, so mutating the result never changes the caller's input.
    """
    if not index:
        return {}
    shift = _base_value(index, base_year)
    if shift == 0.0:
        # Already anchored: copy rather than recompute so values stay
        # bit-identical, but do not hand back an alias of the input.
        return dict(index)
    return {year: value - shift for year, value in index.items()}
|
||||||
|
|
||||||
|
|
||||||
|
def reanchor_dicts(
    indices: dict[str, dict[int, float]], base_year: int
) -> dict[str, dict[int, float]]:
    """Re-anchor every index dict in a mapping to the common `base_year`.

    Returns a new mapping with the same keys; each value is the result of
    :func:`reanchor_dict` on the corresponding input dict.
    """
    return {key: reanchor_dict(idx, base_year) for key, idx in indices.items()}
|
||||||
|
|
||||||
|
|
||||||
def shrink_dicts(raw: dict, parent: dict, n: int) -> dict:
|
def shrink_dicts(raw: dict, parent: dict, n: int) -> dict:
|
||||||
"""Shrink dict values toward parent using n/(n+k) weighting.
|
"""Shrink dict values toward parent using n/(n+k) weighting.
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -158,6 +158,53 @@ def test_transform_crime_writes_by_year_output(tmp_path):
|
||||||
assert serious[2024] == 12.0
|
assert serious[2024] == 12.0
|
||||||
|
|
||||||
|
|
||||||
|
def test_transform_crime_headline_is_mean_of_per_year_bars(tmp_path):
    """The avg/yr headline must equal the average of the by-year chart bars, i.e.
    the simple mean of each year's annualised count -- NOT a month-weighted pooled
    rate. They diverge when years have uneven partial-month coverage."""
    crime_dir = tmp_path / "crime"
    jan23 = crime_dir / "2023-01"
    jan24 = crime_dir / "2024-01"
    feb24 = crime_dir / "2024-02"
    for d in (jan23, jan24, feb24):
        d.mkdir(parents=True)

    header = "Crime ID,Month,Reported by,Falls within,Longitude,Latitude,Location,LSOA code,LSOA name,Crime type,Last outcome category,Context"
    # 2023: 6 burglaries in 1 month -> 6 * 12 / 1 = 72/yr.
    (jan23 / "2023-01-test-force-street.csv").write_text(
        "\n".join(
            [header]
            + [
                f"{i},2023-01,F,F,-0.1,51.5,X,E01000001,L,Burglary,U,"
                for i in range(1, 7)
            ]
        )
        + "\n"
    )
    # 2024: 2 burglaries across 2 months -> 2 * 12 / 2 = 12/yr.
    (jan24 / "2024-01-test-force-street.csv").write_text(
        "\n".join([header, "7,2024-01,F,F,-0.1,51.5,X,E01000001,L,Burglary,U,"]) + "\n"
    )
    (feb24 / "2024-02-test-force-street.csv").write_text(
        "\n".join([header, "8,2024-02,F,F,-0.1,51.5,X,E01000001,L,Burglary,U,"]) + "\n"
    )

    output = tmp_path / "crime.parquet"
    by_year_output = tmp_path / "crime_by_year.parquet"
    transform_crime(crime_dir, output, by_year_output)

    # Mean of per-year bars = (72 + 12) / 2 = 42.0.
    # The old pooled rate (8 incidents / 3 months * 12 = 32.0) would be wrong.
    avg = pl.read_parquet(output).to_dicts()[0]
    assert avg["Burglary (avg/yr)"] == 42.0

    by_year = pl.read_parquet(by_year_output).row(0, named=True)
    burglary = {p["year"]: p["count"] for p in by_year["Burglary (by year)"]}
    assert burglary == {2023: 72.0, 2024: 12.0}
    # Headline equals the mean of the bars it summarises.
    assert avg["Burglary (avg/yr)"] == sum(burglary.values()) / len(burglary)
|
||||||
|
|
||||||
|
|
||||||
def test_transform_crime_fails_without_valid_months(tmp_path):
|
def test_transform_crime_fails_without_valid_months(tmp_path):
|
||||||
crime_dir = tmp_path / "crime"
|
crime_dir = tmp_path / "crime"
|
||||||
month_dir = crime_dir / "2024-01"
|
month_dir = crime_dir / "2024-01"
|
||||||
|
|
|
||||||
|
|
@ -252,6 +252,63 @@ def test_avg_yr_is_simple_mean_of_year_bars(tmp_path):
|
||||||
assert bars == {2023: pytest.approx(24.0, abs=0.05), 2024: pytest.approx(12.0, abs=0.05)}
|
assert bars == {2023: pytest.approx(24.0, abs=0.05), 2024: pytest.approx(12.0, abs=0.05)}
|
||||||
|
|
||||||
|
|
||||||
|
def test_avg_yr_denominator_is_per_postcode_not_global(tmp_path):
    """The avg/yr headline divides by each postcode's own years-present, not by
    the number of distinct years the crime type spans across all postcodes."""
    # P (AB1 1AA) has burglaries only in its single most-recent year (2024); Q
    # (AB1 1AB), far away, has a burglary in 2014. The type therefore spans TWO
    # distinct years across all postcodes, but only ONE year for P. The headline
    # must divide by P's own years-present (1), equalling its single by-year bar
    # (24/yr) -- not by the global span (2), which would deflate it to 12/yr.
    # The two squares are equal-area, so area normalisation leaves counts as-is.
    units = tmp_path / "units"
    _write_boundaries(
        units,
        {
            "AB1": [
                _square_feature("AB1 1AA", 1000, 1000, 1010, 1010),
                _square_feature("AB1 1AB", 5000, 5000, 5010, 5010),
            ]
        },
    )

    crime = tmp_path / "crime"
    # P: 2 burglaries in a single 2024 month -> 24/yr bar, present in 1 year.
    _write_month(
        crime,
        "2024-01",
        [
            _crime_row("2024-01", 1005, 1005, "Burglary"),
            _crime_row("2024-01", 1005, 1005, "Burglary"),
        ],
    )
    # Q: 1 burglary in a far-back 2014 month -> widens the type's global span to
    # two years without adding any incident to P.
    _write_month(crime, "2014-01", [_crime_row("2014-01", 5005, 5005, "Burglary")])

    output = tmp_path / "crime_by_postcode.parquet"
    by_year = tmp_path / "crime_by_postcode_by_year.parquet"
    transform_crime_spatial(crime, units, output, by_year, buffer_m=50.0)

    rows = {r["postcode"]: r for r in pl.read_parquet(output).to_dicts()}
    by_year_rows = {
        r["postcode"]: r for r in pl.read_parquet(by_year).to_dicts()
    }

    # P's headline equals the simple mean of its own bars (just the 2024 bar).
    p_bars = {p["year"]: p["count"] for p in by_year_rows["AB1 1AA"]["Burglary (by year)"]}
    assert p_bars == {2024: pytest.approx(24.0, abs=0.05)}
    # Per-postcode denominator (1) -> 24.0. The old global denominator (2 years
    # across all postcodes) would have deflated this to 12.0.
    assert rows["AB1 1AA"]["Burglary (avg/yr)"] == pytest.approx(24.0, abs=0.05)
    assert rows["AB1 1AA"]["Burglary (avg/yr)"] == pytest.approx(
        sum(p_bars.values()) / len(p_bars), abs=0.05
    )

    # Q likewise: its sole 2014 bar -> 12/yr, divided by its own 1 year = 12.0.
    q_bars = {p["year"]: p["count"] for p in by_year_rows["AB1 1AB"]["Burglary (by year)"]}
    assert q_bars == {2014: pytest.approx(12.0, abs=0.05)}
    assert rows["AB1 1AB"]["Burglary (avg/yr)"] == pytest.approx(12.0, abs=0.05)
|
||||||
|
|
||||||
|
|
||||||
def test_unknown_crime_type_is_dropped_with_warning(tmp_path, capsys):
|
def test_unknown_crime_type_is_dropped_with_warning(tmp_path, capsys):
|
||||||
units = tmp_path / "units"
|
units = tmp_path / "units"
|
||||||
_write_boundaries(
|
_write_boundaries(
|
||||||
|
|
|
||||||
|
|
@ -149,6 +149,7 @@ def test_run_joins_domestic_zip_with_price_paid(tmp_path: Path):
|
||||||
"town_city": ["Exampletown", "Exampletown"],
|
"town_city": ["Exampletown", "Exampletown"],
|
||||||
"duration": ["F", "F"],
|
"duration": ["F", "F"],
|
||||||
"old_new": ["N", "N"],
|
"old_new": ["N", "N"],
|
||||||
|
"ppd_category": ["A", "A"],
|
||||||
}
|
}
|
||||||
).write_parquet(price_paid_path)
|
).write_parquet(price_paid_path)
|
||||||
|
|
||||||
|
|
@ -201,6 +202,7 @@ def test_run_excludes_price_paid_rows_without_full_postcode(tmp_path: Path):
|
||||||
"town_city": ["Exampletown", "Exampletown"],
|
"town_city": ["Exampletown", "Exampletown"],
|
||||||
"duration": ["F", "F"],
|
"duration": ["F", "F"],
|
||||||
"old_new": ["N", "N"],
|
"old_new": ["N", "N"],
|
||||||
|
"ppd_category": ["A", "A"],
|
||||||
}
|
}
|
||||||
).write_parquet(price_paid_path)
|
).write_parquet(price_paid_path)
|
||||||
|
|
||||||
|
|
@ -235,6 +237,7 @@ def test_run_does_not_attach_epc_facts_to_low_score_address_match(tmp_path: Path
|
||||||
"town_city": ["Exampletown"],
|
"town_city": ["Exampletown"],
|
||||||
"duration": ["F"],
|
"duration": ["F"],
|
||||||
"old_new": ["N"],
|
"old_new": ["N"],
|
||||||
|
"ppd_category": ["A"],
|
||||||
}
|
}
|
||||||
).write_parquet(price_paid_path)
|
).write_parquet(price_paid_path)
|
||||||
|
|
||||||
|
|
@ -259,6 +262,93 @@ def test_run_does_not_attach_epc_facts_to_low_score_address_match(tmp_path: Path
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def test_run_excludes_category_b_sales_from_price_aggregations(tmp_path: Path):
|
||||||
|
# Category B entries (repossessions, bulk/portfolio, power-of-sale) must not
|
||||||
|
# pollute latest_price / historical_prices, but the property still survives
|
||||||
|
# via its standard Category A sales.
|
||||||
|
zip_path = tmp_path / "domestic-csv.zip"
|
||||||
|
with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as archive:
|
||||||
|
csv_buffer = io.StringIO()
|
||||||
|
writer = csv.DictWriter(csv_buffer, fieldnames=EPC_SOURCE_COLUMNS)
|
||||||
|
writer.writeheader()
|
||||||
|
writer.writerow(_row())
|
||||||
|
archive.writestr("certificates-2024.csv", csv_buffer.getvalue())
|
||||||
|
|
||||||
|
price_paid_path = tmp_path / "price-paid.parquet"
|
||||||
|
pl.DataFrame(
|
||||||
|
{
|
||||||
|
"price": [200_000, 250_000, 5_000_000],
|
||||||
|
"date_of_transfer": [date(2020, 2, 3), date(2022, 2, 3), date(2024, 2, 3)],
|
||||||
|
"property_type": ["T", "T", "T"],
|
||||||
|
"postcode": ["AA1 1AA", "AA1 1AA", "AA1 1AA"],
|
||||||
|
"paon": ["1", "1", "1"],
|
||||||
|
"saon": [None, None, None],
|
||||||
|
"street": ["Example Street", "Example Street", "Example Street"],
|
||||||
|
"locality": [None, None, None],
|
||||||
|
"town_city": ["Exampletown", "Exampletown", "Exampletown"],
|
||||||
|
"duration": ["F", "F", "F"],
|
||||||
|
"old_new": ["N", "N", "N"],
|
||||||
|
# The latest (5M) sale is a Category B bulk/portfolio transfer.
|
||||||
|
"ppd_category": ["A", "A", "B"],
|
||||||
|
}
|
||||||
|
).write_parquet(price_paid_path)
|
||||||
|
|
||||||
|
output_path = tmp_path / "epc-pp.parquet"
|
||||||
|
_run(zip_path, price_paid_path, output_path, tmp_path)
|
||||||
|
|
||||||
|
df = pl.read_parquet(output_path)
|
||||||
|
|
||||||
|
assert df.height == 1
|
||||||
|
# Only the two Category A sales survive; the 5M Category B transfer is dropped.
|
||||||
|
assert df.get_column("latest_price").to_list() == [250_000]
|
||||||
|
assert df.get_column("historical_prices").list.len().to_list() == [2]
|
||||||
|
|
||||||
|
|
||||||
|
def test_run_new_build_keeps_early_first_transfer_when_sub_min_price(tmp_path: Path):
|
||||||
|
# A new-build whose earliest sale is below MIN_PRICE must still take that early
|
||||||
|
# year as its EXACT construction date, while latest_price uses only the
|
||||||
|
# quality-passing (>=MIN_PRICE) sale.
|
||||||
|
zip_path = tmp_path / "domestic-csv.zip"
|
||||||
|
with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as archive:
|
||||||
|
csv_buffer = io.StringIO()
|
||||||
|
writer = csv.DictWriter(csv_buffer, fieldnames=EPC_SOURCE_COLUMNS)
|
||||||
|
writer.writeheader()
|
||||||
|
writer.writerow(_row())
|
||||||
|
archive.writestr("certificates-2024.csv", csv_buffer.getvalue())
|
||||||
|
|
||||||
|
price_paid_path = tmp_path / "price-paid.parquet"
|
||||||
|
pl.DataFrame(
|
||||||
|
{
|
||||||
|
"price": [30_000, 300_000],
|
||||||
|
"date_of_transfer": [date(2015, 2, 3), date(2022, 2, 3)],
|
||||||
|
"property_type": ["T", "T"],
|
||||||
|
"postcode": ["AA1 1AA", "AA1 1AA"],
|
||||||
|
"paon": ["1", "1"],
|
||||||
|
"saon": [None, None],
|
||||||
|
"street": ["Example Street", "Example Street"],
|
||||||
|
"locality": [None, None],
|
||||||
|
"town_city": ["Exampletown", "Exampletown"],
|
||||||
|
"duration": ["F", "F"],
|
||||||
|
"old_new": ["Y", "Y"],
|
||||||
|
"ppd_category": ["A", "A"],
|
||||||
|
}
|
||||||
|
).write_parquet(price_paid_path)
|
||||||
|
|
||||||
|
output_path = tmp_path / "epc-pp.parquet"
|
||||||
|
_run(zip_path, price_paid_path, output_path, tmp_path)
|
||||||
|
|
||||||
|
df = pl.read_parquet(output_path)
|
||||||
|
|
||||||
|
assert df.height == 1
|
||||||
|
# Construction year is the genuine earliest transfer (2015), flagged EXACT,
|
||||||
|
# even though that sale is below MIN_PRICE.
|
||||||
|
assert df.get_column("construction_age_band").to_list() == [2015]
|
||||||
|
assert df.get_column("is_construction_date_approximate").to_list() == [0]
|
||||||
|
# latest_price uses only the >=MIN_PRICE sale; the sub-MIN sale is excluded.
|
||||||
|
assert df.get_column("latest_price").to_list() == [300_000]
|
||||||
|
assert df.get_column("historical_prices").list.len().to_list() == [1]
|
||||||
|
|
||||||
|
|
||||||
def test_epc_band_to_year_uses_midpoint_and_clamps():
|
def test_epc_band_to_year_uses_midpoint_and_clamps():
|
||||||
import polars as pl
|
import polars as pl
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -13,7 +13,9 @@ from pipeline.transform.merge import (
|
||||||
_active_english_postcode_area,
|
_active_english_postcode_area,
|
||||||
_build_unmatched_listing_seed_rows,
|
_build_unmatched_listing_seed_rows,
|
||||||
_canonical_postcode_expr,
|
_canonical_postcode_expr,
|
||||||
|
_coalesce_direct_epc_columns,
|
||||||
_filter_to_active_english_postcodes,
|
_filter_to_active_english_postcodes,
|
||||||
|
_join_area_side_tables,
|
||||||
_finalize_listings,
|
_finalize_listings,
|
||||||
_integrate_listings,
|
_integrate_listings,
|
||||||
_match_direct_epc,
|
_match_direct_epc,
|
||||||
|
|
@ -506,6 +508,25 @@ def test_load_listings_for_merge_canonicalises_and_exposes_overlay_columns(
|
||||||
assert loaded["_actual_lat"].to_list() == [51.5]
|
assert loaded["_actual_lat"].to_list() == [51.5]
|
||||||
|
|
||||||
|
|
||||||
|
def test_load_listings_for_merge_uprn_key_matches_normalize_uprn(tmp_path) -> None:
|
||||||
|
# A Float UPRN (e.g. read from a NaN-bearing parquet column) must produce
|
||||||
|
# the same digits-only key as `_normalize_uprn` on the candidate side, so
|
||||||
|
# the exact UPRN match is not lost. Naively stringifying "100023336956.0"
|
||||||
|
# and stripping non-digits would yield "1000233369560" (a bogus trailing
|
||||||
|
# zero) which never collides with the candidate key "100023336956".
|
||||||
|
listings_path = tmp_path / "listings.parquet"
|
||||||
|
arcgis_path = tmp_path / "arcgis.parquet"
|
||||||
|
_sample_listings_frame().with_columns(
|
||||||
|
pl.lit(100023336956.0, dtype=pl.Float64).alias("UPRN")
|
||||||
|
).write_parquet(listings_path)
|
||||||
|
_stub_arcgis(arcgis_path)
|
||||||
|
|
||||||
|
loaded = _load_listings_for_merge(listings_path, arcgis_path)
|
||||||
|
|
||||||
|
assert loaded["_listing_uprn"].to_list() == [_normalize_uprn(100023336956.0)]
|
||||||
|
assert loaded["_listing_uprn"].to_list() == ["100023336956"]
|
||||||
|
|
||||||
|
|
||||||
def test_build_unmatched_listing_seed_rows_fills_property_shape_fields(
|
def test_build_unmatched_listing_seed_rows_fills_property_shape_fields(
|
||||||
tmp_path,
|
tmp_path,
|
||||||
) -> None:
|
) -> None:
|
||||||
|
|
@ -697,6 +718,110 @@ def test_normalize_uprn_handles_types_and_floats() -> None:
|
||||||
assert _normalize_uprn(float("nan")) is None
|
assert _normalize_uprn(float("nan")) is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_coalesce_direct_epc_was_council_house_prefers_yes() -> None:
|
||||||
|
# The raw property value is fill_null("No") upstream, so a plain coalesce
|
||||||
|
# would let a non-null "No" override a directly-matched listing "Yes".
|
||||||
|
# "Former council house" should fire if EITHER side says "Yes".
|
||||||
|
none_col = [None] * 5
|
||||||
|
wide = pl.LazyFrame(
|
||||||
|
{
|
||||||
|
"was_council_house": ["No", "Yes", "No", None, None],
|
||||||
|
"_direct_was_council_house": ["Yes", "No", None, "Yes", None],
|
||||||
|
# An unrelated direct-EPC column keeps the plain-coalesce behaviour.
|
||||||
|
"current_energy_rating": [None, "C", "D", None, None],
|
||||||
|
"_direct_current_energy_rating": ["B", "A", None, "E", None],
|
||||||
|
# _coalesce_direct_epc_columns coalesces every pair in
|
||||||
|
# _DIRECT_EPC_RAW_COLUMN_MAP, so the rest must be present too.
|
||||||
|
"epc_address": none_col,
|
||||||
|
"_direct_epc_address": none_col,
|
||||||
|
"potential_energy_rating": none_col,
|
||||||
|
"_direct_potential_energy_rating": none_col,
|
||||||
|
"total_floor_area": none_col,
|
||||||
|
"_direct_total_floor_area": none_col,
|
||||||
|
"number_habitable_rooms": none_col,
|
||||||
|
"_direct_number_habitable_rooms": none_col,
|
||||||
|
"floor_height": none_col,
|
||||||
|
"_direct_floor_height": none_col,
|
||||||
|
"construction_age_band": none_col,
|
||||||
|
"_direct_construction_age_band": none_col,
|
||||||
|
"is_construction_date_approximate": none_col,
|
||||||
|
"_direct_is_construction_date_approximate": none_col,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
result = _coalesce_direct_epc_columns(wide).collect()
|
||||||
|
|
||||||
|
assert result["was_council_house"].to_list() == ["Yes", "Yes", "No", "Yes", None]
|
||||||
|
# Plain coalesce (raw wins when non-null) is untouched for other columns.
|
||||||
|
assert result["current_energy_rating"].to_list() == ["B", "C", "D", "E", None]
|
||||||
|
|
||||||
|
|
||||||
|
def test_join_area_side_tables_preserves_missing_crime_as_null() -> None:
|
||||||
|
# The crime table is LEFT-joined per postcode; a postcode absent from it
|
||||||
|
# must NOT be fabricated as "zero crime" (the safest value). When every
|
||||||
|
# per-type column is null the Serious/Minor rollups must stay null.
|
||||||
|
base = pl.LazyFrame(
|
||||||
|
{
|
||||||
|
"postcode": ["AA1 1AA", "BB2 2BB"],
|
||||||
|
"lsoa21": ["E01000001", "E01000002"],
|
||||||
|
"Local Authority District code (2024)": ["E09000001", "E09000002"],
|
||||||
|
"pcon": ["E14000001", "E14000002"],
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
def _by_postcode(extra: dict) -> pl.LazyFrame:
|
||||||
|
return pl.LazyFrame({"postcode": ["AA1 1AA", "BB2 2BB"], **extra})
|
||||||
|
|
||||||
|
# Crime is present only for AA1 1AA; BB2 2BB is absent from the table.
|
||||||
|
crime = pl.LazyFrame(
|
||||||
|
{
|
||||||
|
"postcode": ["AA1 1AA"],
|
||||||
|
"Violence and sexual offences (avg/yr)": [1.0],
|
||||||
|
"Robbery (avg/yr)": [2.0],
|
||||||
|
"Burglary (avg/yr)": [3.0],
|
||||||
|
"Possession of weapons (avg/yr)": [4.0],
|
||||||
|
"Anti-social behaviour (avg/yr)": [1.0],
|
||||||
|
"Criminal damage and arson (avg/yr)": [1.0],
|
||||||
|
"Shoplifting (avg/yr)": [1.0],
|
||||||
|
"Bicycle theft (avg/yr)": [1.0],
|
||||||
|
"Theft from the person (avg/yr)": [1.0],
|
||||||
|
"Other theft (avg/yr)": [1.0],
|
||||||
|
"Vehicle crime (avg/yr)": [1.0],
|
||||||
|
"Public order (avg/yr)": [1.0],
|
||||||
|
"Drugs (avg/yr)": [1.0],
|
||||||
|
"Other crime (avg/yr)": [1.0],
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
joined = _join_area_side_tables(
|
||||||
|
base,
|
||||||
|
iod=pl.LazyFrame({"LSOA code (2021)": ["E01000001", "E01000002"]}),
|
||||||
|
ethnicity=pl.LazyFrame({"Geography_code": ["E09000001", "E09000002"]}),
|
||||||
|
crime=crime,
|
||||||
|
median_age=pl.LazyFrame({"lsoa21": ["E01000001", "E01000002"]}),
|
||||||
|
election=pl.LazyFrame({"pcon": ["E14000001", "E14000002"]}),
|
||||||
|
poi_counts=_by_postcode({}),
|
||||||
|
noise=_by_postcode({}),
|
||||||
|
school_proximity=_by_postcode({}),
|
||||||
|
conservation_areas=_by_postcode({CONSERVATION_AREA_FEATURE: ["Yes", "No"]}),
|
||||||
|
tree_density=None,
|
||||||
|
broadband=pl.LazyFrame({"bb_postcode": ["AA1 1AA", "BB2 2BB"]}),
|
||||||
|
).collect()
|
||||||
|
|
||||||
|
by_postcode = {
|
||||||
|
row["postcode"]: row
|
||||||
|
for row in joined.select(
|
||||||
|
"postcode", "serious_crime_avg_yr", "minor_crime_avg_yr"
|
||||||
|
).iter_rows(named=True)
|
||||||
|
}
|
||||||
|
# Present postcode: rollups are the component sums (1+2+3+4, 10×1).
|
||||||
|
assert by_postcode["AA1 1AA"]["serious_crime_avg_yr"] == 10.0
|
||||||
|
assert by_postcode["AA1 1AA"]["minor_crime_avg_yr"] == 10.0
|
||||||
|
# Missing postcode: rollups stay null rather than fabricating 0.0.
|
||||||
|
assert by_postcode["BB2 2BB"]["serious_crime_avg_yr"] is None
|
||||||
|
assert by_postcode["BB2 2BB"]["minor_crime_avg_yr"] is None
|
||||||
|
|
||||||
|
|
||||||
def _property_candidates(rows: list[dict]) -> pl.DataFrame:
|
def _property_candidates(rows: list[dict]) -> pl.DataFrame:
|
||||||
base = {
|
base = {
|
||||||
"postcode": "AA1 1AA",
|
"postcode": "AA1 1AA",
|
||||||
|
|
|
||||||
|
|
@ -1,9 +1,44 @@
|
||||||
import polars as pl
|
import polars as pl
|
||||||
|
|
||||||
from pipeline.transform.poi_proximity import (
|
from pipeline.transform.poi_proximity import (
|
||||||
|
POI_GROUPS_2KM,
|
||||||
_build_poi_category_groups,
|
_build_poi_category_groups,
|
||||||
_dynamic_poi_metric_renames,
|
_dynamic_poi_metric_renames,
|
||||||
|
_groceries_categories,
|
||||||
)
|
)
|
||||||
|
from pipeline.utils.poi_counts import count_pois_per_postcode
|
||||||
|
|
||||||
|
|
||||||
|
def test_groceries_2km_counts_geolytix_brand_categories() -> None:
|
||||||
|
"""The static groceries 2km count must include GEOLYTIX brand POIs.
|
||||||
|
|
||||||
|
GEOLYTIX stores the brand (e.g. "Tesco") in `category` with group
|
||||||
|
"Groceries" and never emits the literal "Supermarket"; matching only the
|
||||||
|
OSM strings counts the supermarket but drops the brand store.
|
||||||
|
"""
|
||||||
|
postcodes = pl.DataFrame(
|
||||||
|
{
|
||||||
|
"postcode": ["SW1A 1AA"],
|
||||||
|
"lat": [51.5010],
|
||||||
|
"lon": [-0.1416],
|
||||||
|
}
|
||||||
|
)
|
||||||
|
pois = pl.DataFrame(
|
||||||
|
{
|
||||||
|
"category": ["Tesco", "Supermarket"],
|
||||||
|
"group": ["Groceries", "Groceries"],
|
||||||
|
"lat": [51.5011, 51.5012],
|
||||||
|
"lng": [-0.1417, -0.1418],
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
groups_2km = {**POI_GROUPS_2KM, "groceries": _groceries_categories(pois)}
|
||||||
|
result = count_pois_per_postcode(postcodes, pois, groups=groups_2km, radius_km=2)
|
||||||
|
|
||||||
|
# Both the GEOLYTIX brand ("Tesco") and the OSM "Supermarket" must count.
|
||||||
|
# Pre-fix the static list was ["Greengrocer", "Supermarket", "Convenience
|
||||||
|
# Store"], so "Tesco" was dropped and this was 1.
|
||||||
|
assert result["groceries_2km"][0] == 2
|
||||||
|
|
||||||
|
|
||||||
def test_dynamic_poi_groups_include_requested_categories_only() -> None:
|
def test_dynamic_poi_groups_include_requested_categories_only() -> None:
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,10 @@
|
||||||
import polars as pl
|
import polars as pl
|
||||||
|
|
||||||
from pipeline.transform.transform_poi import transform_grocery_retail_points
|
from pipeline.transform.transform_poi import (
|
||||||
|
_load_ofsted_ratings,
|
||||||
|
_school_icon_category_expr,
|
||||||
|
transform_grocery_retail_points,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def test_transform_grocery_retail_points_outputs_chain_categories():
|
def test_transform_grocery_retail_points_outputs_chain_categories():
|
||||||
|
|
@ -112,6 +116,33 @@ def test_transform_grocery_retail_points_merges_cooperative_societies():
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def test_transform_grocery_retail_points_pools_small_coop_societies_before_cutoff():
|
||||||
|
# Each Co-op society has <5 in-England stores; only after normalising to the
|
||||||
|
# shared "Co-op" brand do they clear MIN_GROCERY_CHAIN_LOCATIONS together.
|
||||||
|
societies = [
|
||||||
|
"Central England Co-operative",
|
||||||
|
"Lincolnshire Co-operative",
|
||||||
|
"The Southern Co-operative",
|
||||||
|
"Midcounties Co-operative",
|
||||||
|
"Heart of England Co-operative",
|
||||||
|
]
|
||||||
|
raw = pl.DataFrame(
|
||||||
|
{
|
||||||
|
"id": list(range(1, len(societies) + 1)),
|
||||||
|
"retailer": societies,
|
||||||
|
"fascia": ["The Co-operative Food"] * len(societies),
|
||||||
|
"store_name": [f"Co-op Test {i}" for i in range(1, len(societies) + 1)],
|
||||||
|
"long_wgs": [-0.141] * len(societies),
|
||||||
|
"lat_wgs": [51.515] * len(societies),
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
pois = transform_grocery_retail_points(raw)
|
||||||
|
|
||||||
|
assert pois.height == len(societies)
|
||||||
|
assert pois["category"].unique().to_list() == ["Co-op"]
|
||||||
|
|
||||||
|
|
||||||
def test_transform_grocery_retail_points_accepts_base_fascias():
|
def test_transform_grocery_retail_points_accepts_base_fascias():
|
||||||
raw = pl.DataFrame(
|
raw = pl.DataFrame(
|
||||||
{
|
{
|
||||||
|
|
@ -177,3 +208,87 @@ def test_transform_grocery_retail_points_includes_unmapped_chains_with_five_loca
|
||||||
{"id": "glx-4", "category": "Tian Tian", "icon_category": "Tian Tian"},
|
{"id": "glx-4", "category": "Tian Tian", "icon_category": "Tian Tian"},
|
||||||
{"id": "glx-5", "category": "Tian Tian", "icon_category": "Tian Tian"},
|
{"id": "glx-5", "category": "Tian Tian", "icon_category": "Tian Tian"},
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def test_load_ofsted_ratings_falls_back_to_ungraded_outcome(tmp_path):
|
||||||
|
# URNs 1-4: graded results map straight through. URNs 5-6: no usable graded
|
||||||
|
# grade (null/"Not judged") but a good/outstanding ungraded outcome, incl.
|
||||||
|
# the "(Concerns)"/"(Improving)" suffixes. URN 7: genuinely "Not judged".
|
||||||
|
# URN 8: a real grade 3 must NOT be overridden by an ungraded outcome.
|
||||||
|
ofsted_path = tmp_path / "ofsted.parquet"
|
||||||
|
pl.DataFrame(
|
||||||
|
{
|
||||||
|
"URN": [1, 2, 3, 4, 5, 6, 7, 8],
|
||||||
|
"Latest OEIF overall effectiveness": [
|
||||||
|
"1",
|
||||||
|
"2",
|
||||||
|
"3",
|
||||||
|
"4",
|
||||||
|
None,
|
||||||
|
"Not judged",
|
||||||
|
"Not judged",
|
||||||
|
"3",
|
||||||
|
],
|
||||||
|
"Ungraded inspection overall outcome": [
|
||||||
|
None,
|
||||||
|
None,
|
||||||
|
None,
|
||||||
|
None,
|
||||||
|
"School remains Outstanding",
|
||||||
|
"School remains Good (Concerns)",
|
||||||
|
None,
|
||||||
|
"School remains Outstanding",
|
||||||
|
],
|
||||||
|
}
|
||||||
|
).write_parquet(ofsted_path)
|
||||||
|
|
||||||
|
ratings = (
|
||||||
|
_load_ofsted_ratings(ofsted_path)
|
||||||
|
.collect()
|
||||||
|
.sort("urn")
|
||||||
|
.to_dicts()
|
||||||
|
)
|
||||||
|
|
||||||
|
assert ratings == [
|
||||||
|
{"urn": 1, "ofsted_rating": "Outstanding"},
|
||||||
|
{"urn": 2, "ofsted_rating": "Good"},
|
||||||
|
{"urn": 3, "ofsted_rating": "Requires improvement"},
|
||||||
|
{"urn": 4, "ofsted_rating": "Inadequate"},
|
||||||
|
{"urn": 5, "ofsted_rating": "Outstanding"},
|
||||||
|
{"urn": 6, "ofsted_rating": "Good"},
|
||||||
|
{"urn": 7, "ofsted_rating": "Not judged"},
|
||||||
|
{"urn": 8, "ofsted_rating": "Requires improvement"},
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def test_school_icon_category_handles_one_sided_age_ranges():
|
||||||
|
# gias._format_age_range emits "up to {high}", "{low}+" and "{low}–{high}".
|
||||||
|
# All three (plus null) must classify, not fall through to "School".
|
||||||
|
df = pl.DataFrame(
|
||||||
|
{
|
||||||
|
"phase": [None, None, None, None, None],
|
||||||
|
"type_group": [None, None, None, None, None],
|
||||||
|
# "up to 5" -> nursery; "16+" -> sixth form; "3–18" -> all-through;
|
||||||
|
# "4–11" -> primary; null age_range with null phase -> "School".
|
||||||
|
"age_range": ["up to 5", "16+", "3–18", "4–11", None],
|
||||||
|
},
|
||||||
|
# Production reads these from a scanned parquet as String; an all-null
|
||||||
|
# Python list would otherwise infer the Null dtype and break .str ops.
|
||||||
|
schema_overrides={
|
||||||
|
"phase": pl.String,
|
||||||
|
"type_group": pl.String,
|
||||||
|
"age_range": pl.String,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
categories = df.select(
|
||||||
|
_school_icon_category_expr().alias("category")
|
||||||
|
)["category"].to_list()
|
||||||
|
|
||||||
|
assert categories == [
|
||||||
|
"Nursery school",
|
||||||
|
"Sixth form",
|
||||||
|
"All-through school",
|
||||||
|
"Primary school",
|
||||||
|
"School",
|
||||||
|
]
|
||||||
|
|
|
||||||
|
|
@ -1289,22 +1289,27 @@ def transform_grocery_retail_points(
|
||||||
)
|
)
|
||||||
df = df.filter(pl.Series(mask))
|
df = df.filter(pl.Series(mask))
|
||||||
|
|
||||||
eligible_retailers = (
|
# Normalise to the display brand FIRST so the ~16 Co-op society retailer
|
||||||
df.group_by("retailer")
|
# names pool into one "Co-op" before the chain-eligibility cutoff; otherwise
|
||||||
|
# small societies (<MIN_GROCERY_CHAIN_LOCATIONS stores each) get dropped.
|
||||||
|
df = df.with_columns(
|
||||||
|
pl.col("retailer")
|
||||||
|
.map_elements(normalize_grocery_retailer, return_dtype=pl.String)
|
||||||
|
.alias("category")
|
||||||
|
)
|
||||||
|
eligible_categories = (
|
||||||
|
df.group_by("category")
|
||||||
.len()
|
.len()
|
||||||
.filter(pl.col("len") >= min_chain_locations)
|
.filter(pl.col("len") >= min_chain_locations)
|
||||||
.select("retailer")
|
.select("category")
|
||||||
)
|
)
|
||||||
df = df.join(eligible_retailers, on="retailer", how="semi")
|
df = df.join(eligible_categories, on="category", how="semi")
|
||||||
|
|
||||||
return df.with_columns(
|
return df.with_columns(
|
||||||
pl.concat_str([pl.lit("glx-"), pl.col("id")]).alias("id"),
|
pl.concat_str([pl.lit("glx-"), pl.col("id")]).alias("id"),
|
||||||
pl.coalesce(["store_name", "fascia", "retailer"])
|
pl.coalesce(["store_name", "fascia", "retailer"])
|
||||||
.str.replace_all("''", "'")
|
.str.replace_all("''", "'")
|
||||||
.alias("name"),
|
.alias("name"),
|
||||||
pl.col("retailer")
|
|
||||||
.map_elements(normalize_grocery_retailer, return_dtype=pl.String)
|
|
||||||
.alias("category"),
|
|
||||||
pl.struct(["fascia", "retailer"])
|
pl.struct(["fascia", "retailer"])
|
||||||
.map_elements(
|
.map_elements(
|
||||||
lambda row: normalize_grocery_icon_category(row["fascia"], row["retailer"]),
|
lambda row: normalize_grocery_icon_category(row["fascia"], row["retailer"]),
|
||||||
|
|
@ -1338,10 +1343,16 @@ def _school_icon_category_expr() -> pl.Expr:
|
||||||
# GIAS phase mixes casing ("Middle deemed Primary" vs "Middle deemed
|
# GIAS phase mixes casing ("Middle deemed Primary" vs "Middle deemed
|
||||||
# primary") so we normalise before matching.
|
# primary") so we normalise before matching.
|
||||||
phase = pl.col("phase").str.to_lowercase()
|
phase = pl.col("phase").str.to_lowercase()
|
||||||
# age_range is "<min>–<max>" using an em-dash; both ends may be missing.
|
# gias._format_age_range emits three shapes: "<low>–<high>" (em-dash),
|
||||||
age_parts = pl.col("age_range").str.split_exact("–", 1)
|
# "up to <high>" (high-only) and "<low>+" (low-only). Extract the leading
|
||||||
min_age = age_parts.struct.field("field_0").cast(pl.Int32, strict=False)
|
# integer as low and the trailing integer as high, then suppress the wrong
|
||||||
max_age = age_parts.struct.field("field_1").cast(pl.Int32, strict=False)
|
# end for the one-sided shapes so they don't collapse to a single bound.
|
||||||
|
age = pl.col("age_range")
|
||||||
|
leading = age.str.extract(r"^\s*(\d+)", 1).cast(pl.Int32, strict=False)
|
||||||
|
trailing = age.str.extract(r"(\d+)\s*$", 1).cast(pl.Int32, strict=False)
|
||||||
|
# "up to N": no low bound; "N+": no high bound.
|
||||||
|
min_age = pl.when(age.str.starts_with("up to")).then(None).otherwise(leading)
|
||||||
|
max_age = pl.when(age.str.ends_with("+")).then(None).otherwise(trailing)
|
||||||
return (
|
return (
|
||||||
pl.when(pl.col("type_group") == "Universities")
|
pl.when(pl.col("type_group") == "Universities")
|
||||||
.then(pl.lit("University"))
|
.then(pl.lit("University"))
|
||||||
|
|
@ -1386,9 +1397,16 @@ OFSTED_OEIF_LABELS = {
|
||||||
def _load_ofsted_ratings(ofsted_path: Path) -> pl.LazyFrame:
|
def _load_ofsted_ratings(ofsted_path: Path) -> pl.LazyFrame:
|
||||||
"""Project the latest OEIF effectiveness grade to a human-readable label,
|
"""Project the latest OEIF effectiveness grade to a human-readable label,
|
||||||
keyed by URN so it can be joined onto the GIAS register. Grades 1-4 map to
|
keyed by URN so it can be joined onto the GIAS register. Grades 1-4 map to
|
||||||
the conventional Ofsted labels; "Not judged" (post-2025 reform schools that
|
the conventional Ofsted labels; when there is no usable graded result
|
||||||
only have a report card) is preserved verbatim; null grades drop out."""
|
(null/"Not judged", e.g. schools last seen under the post-2024 ungraded
|
||||||
|
report-card framework) we fall back to "Ungraded inspection overall outcome"
|
||||||
|
so genuinely good/outstanding schools aren't dropped — mirroring
|
||||||
|
school_proximity.classify_good_plus_schools. Remaining nulls drop out."""
|
||||||
grade_col = pl.col("Latest OEIF overall effectiveness")
|
grade_col = pl.col("Latest OEIF overall effectiveness")
|
||||||
|
# See school_proximity: the ungraded outcome carries "School remains Good"/
|
||||||
|
# "School remains Outstanding" (with optional "(Concerns)"/"(Improving)"
|
||||||
|
# suffixes) when the graded column is null/"Not judged".
|
||||||
|
ungraded = pl.col("Ungraded inspection overall outcome").cast(pl.Utf8, strict=False)
|
||||||
label = (
|
label = (
|
||||||
pl.when(grade_col == "1")
|
pl.when(grade_col == "1")
|
||||||
.then(pl.lit(OFSTED_OEIF_LABELS["1"]))
|
.then(pl.lit(OFSTED_OEIF_LABELS["1"]))
|
||||||
|
|
@ -1398,6 +1416,10 @@ def _load_ofsted_ratings(ofsted_path: Path) -> pl.LazyFrame:
|
||||||
.then(pl.lit(OFSTED_OEIF_LABELS["3"]))
|
.then(pl.lit(OFSTED_OEIF_LABELS["3"]))
|
||||||
.when(grade_col == "4")
|
.when(grade_col == "4")
|
||||||
.then(pl.lit(OFSTED_OEIF_LABELS["4"]))
|
.then(pl.lit(OFSTED_OEIF_LABELS["4"]))
|
||||||
|
.when(ungraded.str.starts_with("School remains Outstanding"))
|
||||||
|
.then(pl.lit(OFSTED_OEIF_LABELS["1"]))
|
||||||
|
.when(ungraded.str.starts_with("School remains Good"))
|
||||||
|
.then(pl.lit(OFSTED_OEIF_LABELS["2"]))
|
||||||
.when(grade_col == "Not judged")
|
.when(grade_col == "Not judged")
|
||||||
.then(pl.lit("Not judged"))
|
.then(pl.lit("Not judged"))
|
||||||
.otherwise(None)
|
.otherwise(None)
|
||||||
|
|
|
||||||
|
|
@ -13,7 +13,12 @@ from pipeline.local_temp import local_tmp_dir
|
||||||
|
|
||||||
_NUMBER_RE = re.compile(r"\d+")
|
_NUMBER_RE = re.compile(r"\d+")
|
||||||
_POSTCODE_RE = r"^[A-Z]{1,2}\d[A-Z\d]?\d[A-Z]{2}$"
|
_POSTCODE_RE = r"^[A-Z]{1,2}\d[A-Z\d]?\d[A-Z]{2}$"
|
||||||
MIN_FUZZY_SCORE = 60
|
# A house number is a strong disambiguator, so a numbered, number-compatible
|
||||||
|
# pair may match on a lower address-similarity score than a number-less one
|
||||||
|
# (named houses / flats by building name), which must match almost exactly to
|
||||||
|
# be trusted. Mirrors merge.py's listings convention.
|
||||||
|
MIN_FUZZY_SCORE = 82
|
||||||
|
MIN_FUZZY_SCORE_WITHOUT_NUMBERS = 90
|
||||||
|
|
||||||
|
|
||||||
def normalize_address_key(s: pl.Expr) -> pl.Expr:
|
def normalize_address_key(s: pl.Expr) -> pl.Expr:
|
||||||
|
|
@ -47,6 +52,7 @@ def fuzzy_join_on_postcode(
|
||||||
left_postcode_col: str,
|
left_postcode_col: str,
|
||||||
right_postcode_col: str,
|
right_postcode_col: str,
|
||||||
min_score: int = MIN_FUZZY_SCORE,
|
min_score: int = MIN_FUZZY_SCORE,
|
||||||
|
min_score_without_numbers: int = MIN_FUZZY_SCORE_WITHOUT_NUMBERS,
|
||||||
) -> pl.LazyFrame:
|
) -> pl.LazyFrame:
|
||||||
"""Fuzzy join two LazyFrames by matching addresses within postcode buckets.
|
"""Fuzzy join two LazyFrames by matching addresses within postcode buckets.
|
||||||
|
|
||||||
|
|
@ -120,7 +126,12 @@ def fuzzy_join_on_postcode(
|
||||||
|
|
||||||
# Build tasks for each postcode bucket
|
# Build tasks for each postcode bucket
|
||||||
tasks = [
|
tasks = [
|
||||||
(left_entries, right_by_postcode[postcode], min_score)
|
(
|
||||||
|
left_entries,
|
||||||
|
right_by_postcode[postcode],
|
||||||
|
min_score,
|
||||||
|
min_score_without_numbers,
|
||||||
|
)
|
||||||
for postcode, left_entries in left_by_postcode.items()
|
for postcode, left_entries in left_by_postcode.items()
|
||||||
if postcode in right_by_postcode
|
if postcode in right_by_postcode
|
||||||
]
|
]
|
||||||
|
|
@ -201,16 +212,23 @@ def _numbers_compatible(a: str, b: str) -> bool:
|
||||||
|
|
||||||
|
|
||||||
def _score_bucket(
|
def _score_bucket(
|
||||||
args: tuple[list[tuple[int, str]], list[tuple[int, str]], int],
|
args: tuple[list[tuple[int, str]], list[tuple[int, str]], int, int],
|
||||||
) -> list[tuple[int, int, int]]:
|
) -> list[tuple[int, int, int]]:
|
||||||
"""Score all address pairs within a single postcode bucket."""
|
"""Score all address pairs within a single postcode bucket."""
|
||||||
left_entries, right_entries, min_score = args
|
left_entries, right_entries, min_score, min_score_without_numbers = args
|
||||||
pairs = []
|
pairs = []
|
||||||
for left_row, left_address in left_entries:
|
for left_row, left_address in left_entries:
|
||||||
for right_row, right_address in right_entries:
|
for right_row, right_address in right_entries:
|
||||||
if not _numbers_compatible(left_address, right_address):
|
if not _numbers_compatible(left_address, right_address):
|
||||||
continue
|
continue
|
||||||
score = fuzz.token_sort_ratio(left_address, right_address)
|
score = fuzz.token_sort_ratio(left_address, right_address)
|
||||||
if score >= min_score:
|
# Number-less pairs (named houses, building-name flats) lack the
|
||||||
|
# house-number disambiguator, so require a near-exact match.
|
||||||
|
threshold = (
|
||||||
|
min_score
|
||||||
|
if _NUMBER_RE.search(left_address) or _NUMBER_RE.search(right_address)
|
||||||
|
else min_score_without_numbers
|
||||||
|
)
|
||||||
|
if score >= threshold:
|
||||||
pairs.append((score, left_row, right_row))
|
pairs.append((score, left_row, right_row))
|
||||||
return pairs
|
return pairs
|
||||||
|
|
|
||||||
|
|
@ -6,6 +6,16 @@ import numpy as np
|
||||||
import polars as pl
|
import polars as pl
|
||||||
from scipy.spatial import cKDTree
|
from scipy.spatial import cKDTree
|
||||||
|
|
||||||
|
# Maximum distance (in OS National Grid metres) a terminated postcode may be from its
|
||||||
|
# nearest active successor to be remapped. Beyond this we treat the postcode as having no
|
||||||
|
# legitimate successor (e.g. demolished/redeveloped land) rather than re-homing it onto a
|
||||||
|
# geometrically-nearest-but-unrelated postcode on a different street/estate/LSOA, which
|
||||||
|
# would pollute the successor's crime/deprivation/school/noise/rent and price stats.
|
||||||
|
# 1km is conservative: it keeps legitimate adjacent remaps while dropping gross
|
||||||
|
# misattributions; dropped postcodes keep their terminated code and fall out at the
|
||||||
|
# active-postcode filter downstream (the honest outcome confirmed by the merge audit).
|
||||||
|
MAX_REMAP_DISTANCE_M = 1000.0
|
||||||
|
|
||||||
|
|
||||||
def build_postcode_mapping(arcgis_path: Path) -> pl.DataFrame:
|
def build_postcode_mapping(arcgis_path: Path) -> pl.DataFrame:
|
||||||
"""Build a mapping from terminated England postcodes to their nearest active postcode.
|
"""Build a mapping from terminated England postcodes to their nearest active postcode.
|
||||||
|
|
@ -50,18 +60,30 @@ def build_postcode_mapping(arcgis_path: Path) -> pl.DataFrame:
|
||||||
)
|
)
|
||||||
|
|
||||||
tree = cKDTree(active_coords)
|
tree = cKDTree(active_coords)
|
||||||
distances, indices = tree.query(terminated_coords)
|
distances, indices = tree.query(
|
||||||
|
terminated_coords, distance_upper_bound=MAX_REMAP_DISTANCE_M
|
||||||
|
)
|
||||||
|
|
||||||
|
# cKDTree returns distance=inf and index==len(active) for points with no neighbour
|
||||||
|
# within the bound. Drop those terminated postcodes rather than gather an out-of-range
|
||||||
|
# index; they keep their terminated code and fall out at the active-postcode filter.
|
||||||
|
within_bound = np.isfinite(distances)
|
||||||
|
dropped = int((~within_bound).sum())
|
||||||
|
|
||||||
active_postcodes = active["pcds"]
|
active_postcodes = active["pcds"]
|
||||||
mapping = pl.DataFrame(
|
mapping = pl.DataFrame(
|
||||||
{
|
{
|
||||||
"old_postcode": terminated["pcds"],
|
"old_postcode": terminated["pcds"].filter(pl.Series(within_bound)),
|
||||||
"new_postcode": active_postcodes.gather(indices),
|
"new_postcode": active_postcodes.gather(indices[within_bound]),
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
kept_distances = distances[within_bound]
|
||||||
print(
|
print(
|
||||||
f"Postcode mapping: max distance = {distances.max():.0f}m, median = {np.median(distances):.0f}m"
|
f"Postcode mapping: {dropped} terminated postcodes dropped (> {MAX_REMAP_DISTANCE_M:.0f}m), "
|
||||||
|
f"max distance = {kept_distances.max():.0f}m, median = {np.median(kept_distances):.0f}m"
|
||||||
|
if kept_distances.size
|
||||||
|
else f"Postcode mapping: {dropped} terminated postcodes dropped (> {MAX_REMAP_DISTANCE_M:.0f}m), none remapped"
|
||||||
)
|
)
|
||||||
|
|
||||||
return mapping
|
return mapping
|
||||||
|
|
|
||||||
|
|
@ -134,6 +134,91 @@ def test_fuzzy_join_on_postcode_rejects_blank_and_invalid_match_keys():
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def test_fuzzy_join_rejects_mid_score_number_less_match():
    """A number-less pair scoring between the two thresholds must stay unmatched.

    Per the original author's note, "THE COACH HOUSE" vs "THE OLD COACH HOUSE" scores 88
    with token_sort_ratio: above the old MIN_FUZZY_SCORE of 60 (so it used to match
    falsely) but below the number-less threshold of 90.
    """
    shared_postcode = "AB1 2CD"
    left_frame = pl.LazyFrame(
        {"left_address": ["The Coach House"], "left_postcode": [shared_postcode]}
    )
    right_frame = pl.LazyFrame(
        {"right_address": ["The Old Coach House"], "right_postcode": [shared_postcode]}
    )

    joined = fuzzy_join_on_postcode(
        left=left_frame,
        right=right_frame,
        left_address_col="left_address",
        right_address_col="right_address",
        left_postcode_col="left_postcode",
        right_postcode_col="right_postcode",
    ).collect()

    matched_addresses = joined["right_address"].to_list()
    assert matched_addresses == [None]
|
||||||
|
|
||||||
|
|
||||||
|
def test_fuzzy_join_matches_numbered_pair_at_baseline_threshold():
    """A numbered pair sitting exactly on the baseline score must still be joined.

    Per the original author's note, "10 ACACIA AVENUE" vs "FLAT A 10 ACACIA AVENUE"
    scores exactly 82 and the house numbers are compatible, so the numbered baseline
    (>= 82) applies and the pair matches.
    """
    shared_postcode = "AB1 2CD"
    right_address = "Flat A, 10 Acacia Avenue"
    left_frame = pl.LazyFrame(
        {"left_address": ["10 Acacia Avenue"], "left_postcode": [shared_postcode]}
    )
    right_frame = pl.LazyFrame(
        {"right_address": [right_address], "right_postcode": [shared_postcode]}
    )

    joined = fuzzy_join_on_postcode(
        left=left_frame,
        right=right_frame,
        left_address_col="left_address",
        right_address_col="right_address",
        left_postcode_col="left_postcode",
        right_postcode_col="right_postcode",
    ).collect()

    matched_addresses = joined["right_address"].to_list()
    assert matched_addresses == [right_address]
|
||||||
|
|
||||||
|
|
||||||
|
def test_fuzzy_join_matches_high_score_number_less_pair():
    """A number-less pair that clears the stricter threshold must still be joined.

    The two addresses differ only in letter case; the original author's note describes
    this as an exact token match (score 100), which is above the number-less threshold
    of 90.
    """
    shared_postcode = "AB1 2CD"
    right_address = "THE OLD RECTORY"
    left_frame = pl.LazyFrame(
        {"left_address": ["The Old Rectory"], "left_postcode": [shared_postcode]}
    )
    right_frame = pl.LazyFrame(
        {"right_address": [right_address], "right_postcode": [shared_postcode]}
    )

    joined = fuzzy_join_on_postcode(
        left=left_frame,
        right=right_frame,
        left_address_col="left_address",
        right_address_col="right_address",
        left_postcode_col="left_postcode",
        right_postcode_col="right_postcode",
    ).collect()

    matched_addresses = joined["right_address"].to_list()
    assert matched_addresses == [right_address]
|
||||||
|
|
||||||
|
|
||||||
def test_normalize_postcode_key_requires_full_postcode():
|
def test_normalize_postcode_key_requires_full_postcode():
|
||||||
df = pl.DataFrame(
|
df = pl.DataFrame(
|
||||||
{
|
{
|
||||||
|
|
|
||||||
|
|
@ -37,7 +37,9 @@ where
|
||||||
|
|
||||||
pub use actual_listings::{ActualListing, ActualListingData};
|
pub use actual_listings::{ActualListing, ActualListingData};
|
||||||
pub use crime_by_year::CrimeByYearData;
|
pub use crime_by_year::CrimeByYearData;
|
||||||
pub use places::{normalize_search_text, PlaceData};
|
pub use places::{
|
||||||
|
compute_trigrams, normalize_search_text, place_alias_tokens, trigram_similarity, PlaceData,
|
||||||
|
};
|
||||||
pub use poi::{resolve_poi_category_filter, POICategoryGroup, POIData, SchoolMetadata};
|
pub use poi::{resolve_poi_category_filter, POICategoryGroup, POIData, SchoolMetadata};
|
||||||
pub use postcodes::{OutcodeData, PostcodeData};
|
pub use postcodes::{OutcodeData, PostcodeData};
|
||||||
pub use property::{
|
pub use property::{
|
||||||
|
|
|
||||||
|
|
@ -120,7 +120,7 @@ impl CrimeByYearData {
|
||||||
.list()
|
.list()
|
||||||
.with_context(|| format!("Column '{col_name}' is not a list"))?;
|
.with_context(|| format!("Column '{col_name}' is not a list"))?;
|
||||||
|
|
||||||
for row in 0..row_count {
|
for (row, postcode) in postcode_values.iter().enumerate().take(row_count) {
|
||||||
let Some(inner) = list_ca.get_as_series(row) else {
|
let Some(inner) = list_ca.get_as_series(row) else {
|
||||||
continue;
|
continue;
|
||||||
};
|
};
|
||||||
|
|
@ -163,7 +163,7 @@ impl CrimeByYearData {
|
||||||
points.sort_by_key(|p| p.year);
|
points.sort_by_key(|p| p.year);
|
||||||
|
|
||||||
series_by_postcode
|
series_by_postcode
|
||||||
.entry(postcode_values[row].clone())
|
.entry(postcode.clone())
|
||||||
.or_default()
|
.or_default()
|
||||||
.push(PostcodeCrimeSeries {
|
.push(PostcodeCrimeSeries {
|
||||||
type_idx: type_idx as u16,
|
type_idx: type_idx as u16,
|
||||||
|
|
|
||||||
|
|
@ -4,10 +4,16 @@ use anyhow::Context;
|
||||||
use polars::frame::DataFrame;
|
use polars::frame::DataFrame;
|
||||||
use polars::lazy::frame::LazyFrame;
|
use polars::lazy::frame::LazyFrame;
|
||||||
use polars::prelude::*;
|
use polars::prelude::*;
|
||||||
|
use rustc_hash::FxHashMap;
|
||||||
use tracing::info;
|
use tracing::info;
|
||||||
|
|
||||||
use crate::utils::InternedColumn;
|
use crate::utils::InternedColumn;
|
||||||
|
|
||||||
|
/// Hard cap on the number of place rows handed to scoring for one query. Candidate sets are
/// expected to be much smaller than this in normal use.
const PLACE_CANDIDATE_LIMIT: usize = 50_000;
/// Shortest token prefix, in bytes, that is stored in / looked up from the prefix index.
const PLACE_PREFIX_MIN_LEN: usize = 2;
/// Longest token prefix, in bytes, that is stored in / looked up from the prefix index.
const PLACE_PREFIX_MAX_LEN: usize = 6;
|
||||||
|
|
||||||
pub struct PlaceData {
|
pub struct PlaceData {
|
||||||
pub name: Vec<String>,
|
pub name: Vec<String>,
|
||||||
pub name_lower: Vec<String>,
|
pub name_lower: Vec<String>,
|
||||||
|
|
@ -19,6 +25,13 @@ pub struct PlaceData {
|
||||||
pub lon: Vec<f32>,
|
pub lon: Vec<f32>,
|
||||||
pub city: Vec<Option<String>>,
|
pub city: Vec<Option<String>>,
|
||||||
pub travel_destination: Vec<bool>,
|
pub travel_destination: Vec<bool>,
|
||||||
|
/// Inverted index from an alias token to the (ascending) place rows containing it. Lets place
|
||||||
|
/// search gather candidates instead of scanning all ~1M+ rows per keystroke.
|
||||||
|
token_index: FxHashMap<String, Vec<u32>>,
|
||||||
|
/// Prefix → indexed tokens, for matching a partially-typed final word.
|
||||||
|
token_prefix_index: FxHashMap<String, Vec<String>>,
|
||||||
|
/// Trigram → fuzzy-eligible rows (settlements/stations only), for bounded typo matching.
|
||||||
|
fuzzy_trigram_index: FxHashMap<u32, Vec<u32>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Clone, Copy)]
|
#[derive(Clone, Copy)]
|
||||||
|
|
@ -168,6 +181,148 @@ pub fn normalize_search_text(text: &str) -> String {
|
||||||
result
|
result
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Iterates over every non-empty token in a place's search text.
///
/// The text is split on both the word separator (`' '`) and the alias separator (`'|'`), so
/// tokens from all aliases come out as one flat stream, in order and with duplicates kept.
/// Used for token-AND matching, where each query word must prefix-match some place token.
pub fn place_alias_tokens(search_text: &str) -> impl Iterator<Item = &str> {
    search_text
        .split(|ch: char| ch == ' ' || ch == '|')
        .filter(|piece| !piece.is_empty())
}
|
||||||
|
|
||||||
|
/// Hashes three characters into a `u32` trigram id.
///
/// Uses an FNV-1a-style mix (xor, then wrapping multiply) seeded with the 32-bit FNV offset
/// basis, applied once per character code point rather than per byte.
fn trigram_hash(first: char, second: char, third: char) -> u32 {
    const OFFSET_BASIS: u32 = 2_166_136_261;
    const PRIME: u32 = 16_777_619;
    [first, second, third]
        .into_iter()
        .fold(OFFSET_BASIS, |acc, ch| (acc ^ u32::from(ch)).wrapping_mul(PRIME))
}
|
||||||
|
|
||||||
|
/// Sorted, de-duplicated padded character trigrams of `text`, for Jaccard fuzzy matching.
|
||||||
|
pub fn compute_trigrams(text: &str) -> Vec<u32> {
|
||||||
|
let norm = normalize_search_text(text);
|
||||||
|
if norm.is_empty() {
|
||||||
|
return Vec::new();
|
||||||
|
}
|
||||||
|
let chars: Vec<char> = [' ', ' ']
|
||||||
|
.into_iter()
|
||||||
|
.chain(norm.chars())
|
||||||
|
.chain(std::iter::once(' '))
|
||||||
|
.collect();
|
||||||
|
let mut grams: Vec<u32> = chars
|
||||||
|
.windows(3)
|
||||||
|
.map(|window| trigram_hash(window[0], window[1], window[2]))
|
||||||
|
.collect();
|
||||||
|
grams.sort_unstable();
|
||||||
|
grams.dedup();
|
||||||
|
grams
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns the row ids present in both ascending-sorted slices, in ascending order.
///
/// Walks both inputs once with a two-way merge; an id is emitted only when the two cursors
/// point at equal values, after which both cursors advance.
fn intersect_sorted(left: &[u32], right: &[u32]) -> Vec<u32> {
    use std::cmp::Ordering;

    let mut shared = Vec::new();
    let mut lhs = left.iter().copied().peekable();
    let mut rhs = right.iter().copied().peekable();
    while let (Some(&a), Some(&b)) = (lhs.peek(), rhs.peek()) {
        match a.cmp(&b) {
            Ordering::Less => {
                lhs.next();
            }
            Ordering::Greater => {
                rhs.next();
            }
            Ordering::Equal => {
                shared.push(a);
                lhs.next();
                rhs.next();
            }
        }
    }
    shared
}
|
||||||
|
|
||||||
|
/// Merges two ascending-sorted row-id slices into one ascending vector.
///
/// A value that both cursors point at simultaneously is emitted once. Whatever remains in
/// either input after the other is exhausted is appended unchanged.
fn union_sorted(left: &[u32], right: &[u32]) -> Vec<u32> {
    use std::cmp::Ordering;

    let mut merged = Vec::with_capacity(left.len() + right.len());
    let (mut li, mut ri) = (0usize, 0usize);
    while let (Some(&l), Some(&r)) = (left.get(li), right.get(ri)) {
        // Pick the value to emit and how far each cursor moves.
        let (value, step_left, step_right) = match l.cmp(&r) {
            Ordering::Less => (l, 1, 0),
            Ordering::Greater => (r, 0, 1),
            Ordering::Equal => (l, 1, 1),
        };
        merged.push(value);
        li += step_left;
        ri += step_right;
    }
    merged.extend_from_slice(&left[li..]);
    merged.extend_from_slice(&right[ri..]);
    merged
}
|
||||||
|
|
||||||
|
/// Distinct indexable tokens (len ≥ 2) across all of a place's search aliases. ASCII because
|
||||||
|
/// `normalize_search_text` already dropped non-alphanumerics, so prefix byte-slicing is safe.
|
||||||
|
fn place_index_tokens(search_text: &str) -> Vec<String> {
|
||||||
|
let mut tokens: Vec<String> = place_alias_tokens(search_text)
|
||||||
|
.filter(|token| token.len() >= 2)
|
||||||
|
.map(ToString::to_string)
|
||||||
|
.collect();
|
||||||
|
tokens.sort_unstable();
|
||||||
|
tokens.dedup();
|
||||||
|
tokens
|
||||||
|
}
|
||||||
|
|
||||||
|
fn build_place_prefix_index(
|
||||||
|
token_index: &FxHashMap<String, Vec<u32>>,
|
||||||
|
) -> FxHashMap<String, Vec<String>> {
|
||||||
|
let mut prefix_index: FxHashMap<String, Vec<String>> = FxHashMap::default();
|
||||||
|
for token in token_index.keys() {
|
||||||
|
let max_len = token.len().min(PLACE_PREFIX_MAX_LEN);
|
||||||
|
for len in PLACE_PREFIX_MIN_LEN..=max_len {
|
||||||
|
prefix_index
|
||||||
|
.entry(token[..len].to_string())
|
||||||
|
.or_default()
|
||||||
|
.push(token.clone());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for tokens in prefix_index.values_mut() {
|
||||||
|
tokens.sort_unstable();
|
||||||
|
tokens.dedup();
|
||||||
|
}
|
||||||
|
prefix_index
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Decides whether rows of `place_type` are included in fuzzy (typo-tolerant) matching.
///
/// Every type is eligible except streets and the POI kinds listed below. Per the original
/// rationale, roads are rarely misspelled and leaving the ~1M street/POI rows out keeps the
/// fuzzy index bounded.
fn is_fuzzy_eligible_type(place_type: &str) -> bool {
    match place_type {
        "street" | "park" | "attraction" | "hospital" | "retail" => false,
        _ => true,
    }
}
|
||||||
|
|
||||||
|
/// Computes the Jaccard similarity (|A ∩ B| / |A ∪ B|, in 0.0–1.0) of two trigram sets.
///
/// Both slices are expected to be sorted ascending, as produced by `compute_trigrams`.
/// Returns 0.0 when either side is empty.
pub fn trigram_similarity(left: &[u32], right: &[u32]) -> f32 {
    if left.is_empty() || right.is_empty() {
        return 0.0;
    }

    // Two-cursor merge that only counts the values both sides share.
    let (mut li, mut ri, mut shared) = (0usize, 0usize, 0usize);
    while let (Some(l), Some(r)) = (left.get(li), right.get(ri)) {
        if l == r {
            shared += 1;
            li += 1;
            ri += 1;
        } else if l < r {
            li += 1;
        } else {
            ri += 1;
        }
    }

    let combined = left.len() + right.len() - shared;
    shared as f32 / combined as f32
}
|
||||||
|
|
||||||
fn replace_token(text: &str, from: &str, to: &str) -> Option<String> {
|
fn replace_token(text: &str, from: &str, to: &str) -> Option<String> {
|
||||||
let mut changed = false;
|
let mut changed = false;
|
||||||
let replaced: Vec<&str> = text
|
let replaced: Vec<&str> = text
|
||||||
|
|
@ -191,15 +346,31 @@ fn push_alias(aliases: &mut Vec<String>, alias: String) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Whole-token abbreviation pairs `(from, to)` used to generate extra search aliases.
///
/// Each abbreviation is listed in both directions so a query matches whichever form the user
/// types (e.g. "gt missenden" ↔ "Great Missenden", "mt" ↔ "Mount"). Entry order is preserved
/// because aliases are generated in table order.
const PLACE_TOKEN_ALIASES: &[(&str, &str)] = &[
    // saint
    ("st", "saint"),
    ("saint", "st"),
    // mount
    ("mt", "mount"),
    ("mount", "mt"),
    // great / little
    ("gt", "great"),
    ("great", "gt"),
    ("lt", "little"),
    ("little", "lt"),
    // upper / lower
    ("upr", "upper"),
    ("upper", "upr"),
    ("lwr", "lower"),
    ("lower", "lwr"),
];
|
||||||
|
|
||||||
fn build_search_text(name: &str, place_type: &str) -> String {
|
fn build_search_text(name: &str, place_type: &str) -> String {
|
||||||
let primary = normalize_search_text(name);
|
let primary = normalize_search_text(name);
|
||||||
let mut aliases = vec![primary.clone()];
|
let mut aliases = vec![primary.clone()];
|
||||||
|
|
||||||
if let Some(alias) = replace_token(&primary, "st", "saint") {
|
for (from, to) in PLACE_TOKEN_ALIASES {
|
||||||
|
if let Some(alias) = replace_token(&primary, from, to) {
|
||||||
push_alias(&mut aliases, alias);
|
push_alias(&mut aliases, alias);
|
||||||
}
|
}
|
||||||
if let Some(alias) = replace_token(&primary, "saint", "st") {
|
|
||||||
push_alias(&mut aliases, alias);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if place_type == "station" {
|
if place_type == "station" {
|
||||||
|
|
@ -391,6 +562,26 @@ impl PlaceData {
|
||||||
fallback_city
|
fallback_city
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Build the place search index: an inverted token index over all rows (so the per-query
|
||||||
|
// cost scales with matched candidates, not the ~1M-row corpus), plus a trigram index over
|
||||||
|
// only fuzzy-eligible rows for bounded typo matching.
|
||||||
|
let mut token_index: FxHashMap<String, Vec<u32>> = FxHashMap::default();
|
||||||
|
let mut fuzzy_trigram_index: FxHashMap<u32, Vec<u32>> = FxHashMap::default();
|
||||||
|
for idx in 0..row_count {
|
||||||
|
for token in place_index_tokens(&name_search[idx]) {
|
||||||
|
token_index.entry(token).or_default().push(idx as u32);
|
||||||
|
}
|
||||||
|
if is_fuzzy_eligible_type(&place_type_raw[idx]) {
|
||||||
|
for trigram in compute_trigrams(&name[idx]) {
|
||||||
|
fuzzy_trigram_index
|
||||||
|
.entry(trigram)
|
||||||
|
.or_default()
|
||||||
|
.push(idx as u32);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
let token_prefix_index = build_place_prefix_index(&token_index);
|
||||||
|
|
||||||
let with_pop = population.iter().filter(|&&pop| pop > 0).count();
|
let with_pop = population.iter().filter(|&&pop| pop > 0).count();
|
||||||
let with_city = city.iter().filter(|c| c.is_some()).count();
|
let with_city = city.iter().filter(|c| c.is_some()).count();
|
||||||
info!(
|
info!(
|
||||||
|
|
@ -398,6 +589,8 @@ impl PlaceData {
|
||||||
types = place_type.values.len(),
|
types = place_type.values.len(),
|
||||||
with_population = with_pop,
|
with_population = with_pop,
|
||||||
with_city = with_city,
|
with_city = with_city,
|
||||||
|
tokens = token_index.len(),
|
||||||
|
fuzzy_trigrams = fuzzy_trigram_index.len(),
|
||||||
"Place data loaded"
|
"Place data loaded"
|
||||||
);
|
);
|
||||||
|
|
||||||
|
|
@ -412,14 +605,261 @@ impl PlaceData {
|
||||||
lon,
|
lon,
|
||||||
city,
|
city,
|
||||||
travel_destination,
|
travel_destination,
|
||||||
|
token_index,
|
||||||
|
token_prefix_index,
|
||||||
|
fuzzy_trigram_index,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Candidate place rows for the query content tokens: intersect the posting lists of words
|
||||||
|
/// typed in full; if none matched an indexed token exactly, seed from the smallest
|
||||||
|
/// prefix-expanded list (so a partially-typed final word still works). Bounded by
|
||||||
|
/// `PLACE_CANDIDATE_LIMIT`.
|
||||||
|
pub fn place_candidate_rows(&self, tokens: &[&str]) -> Vec<u32> {
|
||||||
|
let mut exact: Vec<&[u32]> = tokens
|
||||||
|
.iter()
|
||||||
|
.filter_map(|token| self.token_index.get(*token).map(Vec::as_slice))
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
let mut rows = if exact.is_empty() {
|
||||||
|
self.place_prefix_seed(tokens)
|
||||||
|
} else {
|
||||||
|
exact.sort_by_key(|posting| posting.len());
|
||||||
|
let mut acc = exact[0].to_vec();
|
||||||
|
for posting in &exact[1..] {
|
||||||
|
if acc.is_empty() {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
acc = intersect_sorted(&acc, posting);
|
||||||
|
}
|
||||||
|
acc
|
||||||
|
};
|
||||||
|
rows.truncate(PLACE_CANDIDATE_LIMIT);
|
||||||
|
rows
|
||||||
|
}
|
||||||
|
|
||||||
|
fn place_prefix_seed(&self, tokens: &[&str]) -> Vec<u32> {
|
||||||
|
let mut best: Option<Vec<u32>> = None;
|
||||||
|
for token in tokens {
|
||||||
|
if token.len() < PLACE_PREFIX_MIN_LEN {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
let key = &token[..token.len().min(PLACE_PREFIX_MAX_LEN)];
|
||||||
|
let Some(indexed) = self.token_prefix_index.get(key) else {
|
||||||
|
continue;
|
||||||
|
};
|
||||||
|
let mut union: Vec<u32> = Vec::new();
|
||||||
|
for indexed_token in indexed {
|
||||||
|
if !indexed_token.starts_with(token) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if let Some(rows) = self.token_index.get(indexed_token) {
|
||||||
|
union = if union.is_empty() {
|
||||||
|
rows.clone()
|
||||||
|
} else {
|
||||||
|
union_sorted(&union, rows)
|
||||||
|
};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !union.is_empty()
|
||||||
|
&& best
|
||||||
|
.as_ref()
|
||||||
|
.is_none_or(|current| union.len() < current.len())
|
||||||
|
{
|
||||||
|
best = Some(union);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
best.unwrap_or_default()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Fuzzy-eligible rows sharing enough trigrams with the query to be worth Jaccard scoring.
|
||||||
|
/// Bounded by the (small) fuzzy trigram index rather than scanning every place.
|
||||||
|
pub fn fuzzy_candidate_rows(&self, query_trigrams: &[u32]) -> Vec<u32> {
|
||||||
|
if query_trigrams.is_empty() {
|
||||||
|
return Vec::new();
|
||||||
|
}
|
||||||
|
let mut counts: FxHashMap<u32, u16> = FxHashMap::default();
|
||||||
|
for trigram in query_trigrams {
|
||||||
|
if let Some(rows) = self.fuzzy_trigram_index.get(trigram) {
|
||||||
|
for &row in rows {
|
||||||
|
*counts.entry(row).or_default() += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
let min_shared = (((query_trigrams.len() as f32) * 0.4).ceil() as u16).max(1);
|
||||||
|
counts
|
||||||
|
.into_iter()
|
||||||
|
.filter_map(|(row, shared)| (shared >= min_shared).then_some(row))
|
||||||
|
.collect()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
impl PlaceData {
|
||||||
|
/// Build a minimal PlaceData from (name, place_type) pairs for index tests.
|
||||||
|
fn from_names<S: AsRef<str>>(rows: &[(S, S)]) -> Self {
|
||||||
|
let name: Vec<String> = rows.iter().map(|(nm, _)| nm.as_ref().to_string()).collect();
|
||||||
|
let place_type_raw: Vec<String> =
|
||||||
|
rows.iter().map(|(_, pt)| pt.as_ref().to_string()).collect();
|
||||||
|
let name_lower: Vec<String> = name.iter().map(|nm| nm.to_lowercase()).collect();
|
||||||
|
let name_search: Vec<String> = name
|
||||||
|
.iter()
|
||||||
|
.zip(&place_type_raw)
|
||||||
|
.map(|(nm, pt)| build_search_text(nm, pt))
|
||||||
|
.collect();
|
||||||
|
let mut token_index: FxHashMap<String, Vec<u32>> = FxHashMap::default();
|
||||||
|
let mut fuzzy_trigram_index: FxHashMap<u32, Vec<u32>> = FxHashMap::default();
|
||||||
|
for idx in 0..name.len() {
|
||||||
|
for token in place_index_tokens(&name_search[idx]) {
|
||||||
|
token_index.entry(token).or_default().push(idx as u32);
|
||||||
|
}
|
||||||
|
if is_fuzzy_eligible_type(&place_type_raw[idx]) {
|
||||||
|
for trigram in compute_trigrams(&name[idx]) {
|
||||||
|
fuzzy_trigram_index
|
||||||
|
.entry(trigram)
|
||||||
|
.or_default()
|
||||||
|
.push(idx as u32);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
let token_prefix_index = build_place_prefix_index(&token_index);
|
||||||
|
let len = name.len();
|
||||||
|
PlaceData {
|
||||||
|
name,
|
||||||
|
name_lower,
|
||||||
|
name_search,
|
||||||
|
place_type: InternedColumn::build(&place_type_raw),
|
||||||
|
type_rank: place_type_raw.iter().map(|pt| type_rank(pt)).collect(),
|
||||||
|
population: vec![0; len],
|
||||||
|
lat: vec![0.0; len],
|
||||||
|
lon: vec![0.0; len],
|
||||||
|
city: vec![None; len],
|
||||||
|
travel_destination: vec![false; len],
|
||||||
|
token_index,
|
||||||
|
token_prefix_index,
|
||||||
|
fuzzy_trigram_index,
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use super::*;
|
use super::*;
|
||||||
|
|
||||||
|
#[test]
fn place_index_tokens_dedup_and_min_length() {
    // The one-character alias "a" is below the two-byte minimum and must be dropped;
    // "albans" occurs in two aliases (split on " | ") and must appear once; output is sorted.
    assert_eq!(
        place_index_tokens("st albans | saint albans | a"),
        vec!["albans".to_string(), "saint".to_string(), "st".to_string()]
    );
}
|
||||||
|
|
||||||
|
#[test]
fn place_candidate_rows_intersect_and_prefix_seed() {
    // Row ids follow the order of this list: 0..=4.
    let places = PlaceData::from_names(&[
        ("Camden", "suburb"),
        ("Camden Town", "suburb"),
        ("Camden Market", "attraction"),
        ("Manchester", "city"),
        ("Manchester Piccadilly", "station"),
    ]);

    // One fully-typed word returns its whole posting list: all three Camden rows.
    assert_eq!(places.place_candidate_rows(&["camden"]), vec![0, 1, 2]);

    // Two fully-typed words are intersected: only "Camden Town" contains both.
    assert_eq!(places.place_candidate_rows(&["camden", "town"]), vec![1]);

    // "piccad" is not an indexed token, so candidates are seeded via the prefix index.
    assert_eq!(places.place_candidate_rows(&["piccad"]), vec![4]);

    // Nothing indexed starts with "zzzz".
    let unmatched = places.place_candidate_rows(&["zzzz"]);
    assert!(unmatched.is_empty());
}
|
||||||
|
|
||||||
|
// Ignored by default. Run with:
//   cargo test --release bench_place_search -- --ignored --nocapture
#[test]
#[ignore]
fn bench_place_search_at_one_million_rows() {
    const STREET_ROWS: usize = 1_000_000;
    const QUERY_RUNS: u32 = 50;

    let roads = [
        "High Street",
        "Station Road",
        "Church Lane",
        "Victoria Road",
        "Mill Lane",
        "Park Avenue",
        "Queens Road",
        "Kings Road",
    ];
    // Suffix each road with a number so the index resembles ~1M distinct (street, area) rows.
    let mut rows: Vec<(String, String)> = (0..STREET_ROWS)
        .map(|i| {
            (
                format!("{} {}", roads[i % roads.len()], i % 4000),
                "street".into(),
            )
        })
        .collect();
    rows.push(("London".into(), "city".into()));
    let pd = PlaceData::from_names(&rows);

    let query_tokens = ["high", "street"];
    let started = std::time::Instant::now();
    let mut hits = 0usize;
    for _ in 0..QUERY_RUNS {
        hits += pd
            .place_candidate_rows(&query_tokens)
            .into_iter()
            .filter(|&row| {
                place_search_test_score(&pd, row as usize, "high street", &query_tokens).is_some()
            })
            .count();
    }
    let per_query = started.elapsed() / QUERY_RUNS;
    println!(
        "indexed place search over {} rows: {:?}/query ({} hits)",
        pd.name.len(),
        per_query,
        hits / QUERY_RUNS as usize
    );
    // Per the original note, the old full O(N) scan measured ~36ms here; the candidate-based
    // path must be well under that.
    assert!(per_query.as_millis() < 10, "per_query was {per_query:?}");
}
|
||||||
|
|
||||||
|
/// Mirrors the route's per-candidate match check for the bench.
|
||||||
|
fn place_search_test_score(
|
||||||
|
pd: &PlaceData,
|
||||||
|
idx: usize,
|
||||||
|
query_search: &str,
|
||||||
|
query_tokens: &[&str],
|
||||||
|
) -> Option<f32> {
|
||||||
|
let search_text = &pd.name_search[idx];
|
||||||
|
if query_tokens.iter().all(|qt| {
|
||||||
|
place_alias_tokens(search_text)
|
||||||
|
.any(|t| t == *qt || (qt.len() >= 2 && t.starts_with(qt)))
|
||||||
|
}) {
|
||||||
|
Some(640.0)
|
||||||
|
} else if pd.name_lower[idx] == query_search {
|
||||||
|
Some(1000.0)
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
fn fuzzy_candidate_rows_finds_typos_only_for_eligible_rows() {
    const CITY_ROW: u32 = 0;
    const STREET_ROW: u32 = 1;

    let places = PlaceData::from_names(&[
        ("London", "city"),
        ("Baker Street", "street"), // streets are not fuzzy-eligible
    ]);

    let matched = places.fuzzy_candidate_rows(&compute_trigrams("Londn"));

    // The misspelling still reaches the city row through the trigram index...
    assert!(matched.contains(&CITY_ROW));
    // ...while the street row was never added to that index.
    assert!(!matched.contains(&STREET_ROW));
}
|
||||||
|
|
||||||
fn test_city_rows() -> [(&'static str, f32, f32, u32); 5] {
|
fn test_city_rows() -> [(&'static str, f32, f32, u32); 5] {
|
||||||
[
|
[
|
||||||
("London", 51.507_446, -0.1277653, 8_908_083),
|
("London", 51.507_446, -0.1277653, 8_908_083),
|
||||||
|
|
@ -470,6 +910,29 @@ mod tests {
|
||||||
assert!(build_search_text("Shadwell DLR station", "station").contains("shadwell station"));
|
assert!(build_search_text("Shadwell DLR station", "station").contains("shadwell station"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
fn search_text_expands_directional_and_size_abbreviations() {
    // (name, place type, abbreviated alias expected somewhere in the search text)
    let cases = [
        ("Great Missenden", "village", "gt missenden"),
        ("Mount Pleasant", "suburb", "mt pleasant"),
        ("Little Venice", "suburb", "lt venice"),
    ];
    for (name, place_type, expected_alias) in cases {
        assert!(build_search_text(name, place_type).contains(expected_alias));
    }
}
|
||||||
|
|
||||||
|
#[test]
fn trigram_similarity_is_high_for_typos_and_low_for_unrelated() {
    let [london, typo, other] = ["London", "Londn", "Manchester"].map(compute_trigrams);

    // A one-letter omission stays above the fuzzy cut-off used in this test.
    let typo_score = trigram_similarity(&london, &typo);
    assert!(typo_score >= 0.4);

    // An unrelated name shares almost nothing.
    let unrelated_score = trigram_similarity(&london, &other);
    assert!(unrelated_score < 0.2);

    // A set compared with itself is a perfect match.
    let self_score = trigram_similarity(&london, &london);
    assert!((self_score - 1.0).abs() < 1e-6);
}
|
||||||
|
|
||||||
|
#[test]
fn place_alias_tokens_split_across_aliases() {
    // Tokens from both aliases come out in order, with the repeated "kings" kept.
    let expected = ["kings", "cross", "kings", "x"];
    let actual: Vec<&str> = place_alias_tokens("kings cross | kings x").collect();
    assert_eq!(actual, expected);
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn travel_destination_types_match_legacy_places() {
|
fn travel_destination_types_match_legacy_places() {
|
||||||
assert!(is_travel_destination_type("city"));
|
assert!(is_travel_destination_type("city"));
|
||||||
|
|
|
||||||
|
|
@ -398,7 +398,7 @@ fn build_school_meta(
|
||||||
|
|
||||||
let mut idx = vec![u32::MAX; row_count];
|
let mut idx = vec![u32::MAX; row_count];
|
||||||
let mut meta = Vec::new();
|
let mut meta = Vec::new();
|
||||||
for row in 0..row_count {
|
for (row, meta_idx) in idx.iter_mut().enumerate().take(row_count) {
|
||||||
let type_group_val = fetch_str(&type_group, row);
|
let type_group_val = fetch_str(&type_group, row);
|
||||||
let type_val = fetch_str(&r#type, row);
|
let type_val = fetch_str(&r#type, row);
|
||||||
// type_group is present for every GIAS row, so use it as the sentinel
|
// type_group is present for every GIAS row, so use it as the sentinel
|
||||||
|
|
@ -406,7 +406,7 @@ fn build_school_meta(
|
||||||
if type_group_val.is_none() && type_val.is_none() {
|
if type_group_val.is_none() && type_val.is_none() {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
idx[row] = meta.len() as u32;
|
*meta_idx = meta.len() as u32;
|
||||||
meta.push(SchoolMetadata {
|
meta.push(SchoolMetadata {
|
||||||
phase: fetch_str(&phase, row),
|
phase: fetch_str(&phase, row),
|
||||||
r#type: type_val,
|
r#type: type_val,
|
||||||
|
|
|
||||||
|
|
@ -10,8 +10,10 @@ use rustc_hash::{FxHashMap, FxHashSet};
|
||||||
use crate::consts::{H3_PRECOMPUTE_MAX, HISTOGRAM_BINS, NAN_U16, QUANT_SCALE};
|
use crate::consts::{H3_PRECOMPUTE_MAX, HISTOGRAM_BINS, NAN_U16, QUANT_SCALE};
|
||||||
use crate::features::{self, Bounds};
|
use crate::features::{self, Bounds};
|
||||||
|
|
||||||
const ADDRESS_SEARCH_CANDIDATE_LIMIT: usize = 50_000;
|
/// Upper bound on rows scored per query. Intersection keeps most candidate sets far below
|
||||||
const ADDRESS_SEARCH_MAX_POSTINGS_PER_TOKEN: usize = 250_000;
|
/// this; only a single very common road word (e.g. "high") approaches it, and the in-area
|
||||||
|
/// priority sort keeps a refined query's matches ahead of the cut.
|
||||||
|
const ADDRESS_SEARCH_CANDIDATE_LIMIT: usize = 150_000;
|
||||||
const ADDRESS_SEARCH_PREFIX_MIN_LEN: usize = 4;
|
const ADDRESS_SEARCH_PREFIX_MIN_LEN: usize = 4;
|
||||||
const ADDRESS_SEARCH_PREFIX_MAX_LEN: usize = 8;
|
const ADDRESS_SEARCH_PREFIX_MAX_LEN: usize = 8;
|
||||||
const NO_POI_METRIC_ROW: u32 = u32::MAX;
|
const NO_POI_METRIC_ROW: u32 = u32::MAX;
|
||||||
|
|
@ -162,6 +164,11 @@ struct AddressTermGroup {
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
struct AddressQuery {
|
struct AddressQuery {
|
||||||
full_postcode: Option<String>,
|
full_postcode: Option<String>,
|
||||||
|
/// Compact uppercase outward code (optionally with a sector digit) recovered when the
|
||||||
|
/// user appended a partial postcode like "NW1" or "NW1 6". Used as an additive ranking
|
||||||
|
/// bias, never as a hard filter — so the disambiguating hint is honoured without
|
||||||
|
/// excluding the same road in other areas.
|
||||||
|
postcode_area: Option<String>,
|
||||||
text_groups: Vec<AddressTermGroup>,
|
text_groups: Vec<AddressTermGroup>,
|
||||||
numeric_terms: Vec<String>,
|
numeric_terms: Vec<String>,
|
||||||
candidate_terms: Vec<String>,
|
candidate_terms: Vec<String>,
|
||||||
|
|
@ -442,6 +449,138 @@ fn build_address_prefix_index(
|
||||||
prefix_index
|
prefix_index
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Returns the row ids present in both `left` and `right`.
///
/// Both inputs are expected to be sorted ascending; the output keeps that order. The walk
/// advances whichever side currently holds the smaller id, so each element is visited once.
fn intersect_sorted(left: &[u32], right: &[u32]) -> Vec<u32> {
    use std::cmp::Ordering::{Equal, Greater, Less};

    let mut common = Vec::new();
    let (mut a, mut b) = (left, right);
    while let (Some(&x), Some(&y)) = (a.first(), b.first()) {
        match x.cmp(&y) {
            Less => a = &a[1..],
            Greater => b = &b[1..],
            Equal => {
                common.push(x);
                a = &a[1..];
                b = &b[1..];
            }
        }
    }
    common
}
|
||||||
|
|
||||||
|
/// Merges two ascending-sorted row-id slices into one ascending list.
///
/// An id that appears in both inputs is emitted once. Whatever remains of either side after
/// the other is exhausted is appended unchanged.
fn union_sorted(left: &[u32], right: &[u32]) -> Vec<u32> {
    use std::cmp::Ordering::{Equal, Greater, Less};

    let mut merged = Vec::with_capacity(left.len() + right.len());
    let (mut a, mut b) = (left, right);
    while let (Some(&x), Some(&y)) = (a.first(), b.first()) {
        match x.cmp(&y) {
            Less => {
                merged.push(x);
                a = &a[1..];
            }
            Greater => {
                merged.push(y);
                b = &b[1..];
            }
            Equal => {
                // Shared id: take it once and step past it on both sides.
                merged.push(x);
                a = &a[1..];
                b = &b[1..];
            }
        }
    }
    merged.extend_from_slice(a);
    merged.extend_from_slice(b);
    merged
}
|
||||||
|
|
||||||
|
/// Returns true for an ordinal like "1st", "2nd", "3rd" or "21st": one or more ASCII digits
/// followed by "st", "nd", "rd" or "th". An ordinal is part of the street name
/// ("2nd Avenue"), not a house-number prefix.
///
/// The suffix is removed with `strip_suffix` rather than a byte-offset split, so a token
/// whose last bytes belong to a multi-byte character (e.g. "môr") yields `false` instead of
/// panicking on a split that is not on a character boundary.
fn is_ordinal_token(token: &str) -> bool {
    ["st", "nd", "rd", "th"]
        .iter()
        .filter_map(|suffix| token.strip_suffix(suffix))
        .any(|digits| !digits.is_empty() && digits.bytes().all(|byte| byte.is_ascii_digit()))
}
|
||||||
|
|
||||||
|
/// Leading address tokens that denote a unit/house number rather than the street itself.
|
||||||
|
fn is_house_prefix_token(token: &str) -> bool {
|
||||||
|
if is_ordinal_token(token) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
matches!(
|
||||||
|
token,
|
||||||
|
"flat" | "fl" | "apartment" | "apt" | "unit" | "no" | "block" | "floor" | "room"
|
||||||
|
) || token.len() == 1
|
||||||
|
|| token.chars().all(|ch| ch.is_ascii_digit())
|
||||||
|
|| (token.chars().next().is_some_and(|ch| ch.is_ascii_digit())
|
||||||
|
&& token.chars().any(|ch| ch.is_ascii_alphabetic()))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Street-level key for an address: drops the leading house-number / flat prefix so that
|
||||||
|
/// "12 Baker Street" and "5 Baker Street" collapse to a single street entry.
|
||||||
|
fn street_key(address: &str) -> String {
|
||||||
|
let tokens = tokenize_address_text(address);
|
||||||
|
let mut start = 0;
|
||||||
|
while start < tokens.len() && is_house_prefix_token(&tokens[start]) {
|
||||||
|
start += 1;
|
||||||
|
}
|
||||||
|
if start >= tokens.len() {
|
||||||
|
return tokens.join(" ");
|
||||||
|
}
|
||||||
|
tokens[start..].join(" ")
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Road-type words and their abbreviations. A query containing one of these (and no house
/// number) is treated as a road browse and collapsed to one result per street.
#[rustfmt::skip]
const ROAD_TYPE_TOKENS: &[&str] = &[
    "street", "st",
    "road", "rd",
    "lane", "ln",
    "avenue", "ave",
    "close", "cl",
    "drive", "dr",
    "way",
    "court", "ct",
    "crescent", "cres",
    "place",
    "terrace", "terr",
    "grove",
    "gardens", "gdns",
    "walk",
    "row",
    "square", "sq",
    "hill",
    "parade",
    "mews",
    "embankment",
    "broadway",
    "boulevard", "blvd",
];
|
||||||
|
|
||||||
|
fn query_has_road_type(query: &str) -> bool {
|
||||||
|
tokenize_address_text(query)
|
||||||
|
.iter()
|
||||||
|
.any(|token| ROAD_TYPE_TOKENS.contains(&token.as_str()))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns the outward code of a canonical postcode: the text before the first space, or
/// the whole input when it contains no space.
fn outcode_of(postcode: &str) -> &str {
    match postcode.find(' ') {
        Some(space) => &postcode[..space],
        None => postcode,
    }
}
|
||||||
|
|
||||||
fn parse_address_query(query: &str) -> AddressQuery {
|
fn parse_address_query(query: &str) -> AddressQuery {
|
||||||
let tokens = tokenize_address_text(query);
|
let tokens = tokenize_address_text(query);
|
||||||
let (full_postcode, postcode_token_indices) = extract_full_postcode(&tokens)
|
let (full_postcode, postcode_token_indices) = extract_full_postcode(&tokens)
|
||||||
|
|
@ -449,12 +588,45 @@ fn parse_address_query(query: &str) -> AddressQuery {
|
||||||
.unwrap_or((None, Vec::new()));
|
.unwrap_or((None, Vec::new()));
|
||||||
|
|
||||||
let skip_postcode_tokens: FxHashSet<usize> = postcode_token_indices.into_iter().collect();
|
let skip_postcode_tokens: FxHashSet<usize> = postcode_token_indices.into_iter().collect();
|
||||||
|
|
||||||
|
// Recover an appended partial postcode (outcode, or outcode + sector digit) as a ranking
|
||||||
|
// bias rather than discarding it — but only from the TRAILING position, so a leading road
|
||||||
|
// designation like "A4 Great West Road" is not mistaken for an area refinement.
|
||||||
|
let mut postcode_area: Option<String> = None;
|
||||||
|
let mut consumed_partial_tokens: FxHashSet<usize> = FxHashSet::default();
|
||||||
|
if full_postcode.is_none() && !tokens.is_empty() {
|
||||||
|
let last = tokens.len() - 1;
|
||||||
|
if !skip_postcode_tokens.contains(&last) {
|
||||||
|
let sector_digit =
|
||||||
|
tokens[last].len() == 1 && tokens[last].chars().all(|ch| ch.is_ascii_digit());
|
||||||
|
if last >= 1
|
||||||
|
&& sector_digit
|
||||||
|
&& !skip_postcode_tokens.contains(&(last - 1))
|
||||||
|
&& looks_like_postcode_fragment(&tokens[last - 1])
|
||||||
|
{
|
||||||
|
postcode_area = Some(format!(
|
||||||
|
"{}{}",
|
||||||
|
tokens[last - 1].to_ascii_uppercase(),
|
||||||
|
tokens[last]
|
||||||
|
));
|
||||||
|
consumed_partial_tokens.insert(last);
|
||||||
|
consumed_partial_tokens.insert(last - 1);
|
||||||
|
} else if looks_like_postcode_fragment(&tokens[last]) {
|
||||||
|
postcode_area = Some(tokens[last].to_ascii_uppercase());
|
||||||
|
consumed_partial_tokens.insert(last);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
let mut text_groups = Vec::new();
|
let mut text_groups = Vec::new();
|
||||||
let mut numeric_terms = Vec::new();
|
let mut numeric_terms = Vec::new();
|
||||||
let mut candidate_terms = Vec::new();
|
let mut candidate_terms = Vec::new();
|
||||||
|
|
||||||
for (idx, token) in tokens.iter().enumerate() {
|
for (idx, token) in tokens.iter().enumerate() {
|
||||||
if skip_postcode_tokens.contains(&idx) || looks_like_postcode_fragment(token) {
|
if skip_postcode_tokens.contains(&idx)
|
||||||
|
|| consumed_partial_tokens.contains(&idx)
|
||||||
|
|| looks_like_postcode_fragment(token)
|
||||||
|
{
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -486,6 +658,7 @@ fn parse_address_query(query: &str) -> AddressQuery {
|
||||||
|
|
||||||
AddressQuery {
|
AddressQuery {
|
||||||
full_postcode,
|
full_postcode,
|
||||||
|
postcode_area,
|
||||||
text_groups,
|
text_groups,
|
||||||
numeric_terms,
|
numeric_terms,
|
||||||
candidate_terms,
|
candidate_terms,
|
||||||
|
|
@ -897,9 +1070,15 @@ impl PropertyData {
|
||||||
&self.address_search_token_keys[offset..offset + length]
|
&self.address_search_token_keys[offset..offset + length]
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Search individual property addresses. Full postcode queries use a direct row index;
|
/// Search individual property addresses, returning `(row, score)` ranked best-first.
|
||||||
/// free-text queries use a small inverted index over distinctive address tokens.
|
///
|
||||||
pub fn search_addresses(&self, query: &str, limit: usize) -> Vec<usize> {
|
/// Candidate rows come from intersecting the posting lists of the distinctive words the
|
||||||
|
/// user typed in full (so "Cherry Hinton Road" narrows to rows containing both), unioned
|
||||||
|
/// with the exact-postcode rows when a complete postcode is present (so a postcode is a
|
||||||
|
/// boost, not an all-or-nothing gate). An appended partial postcode keeps in-area rows
|
||||||
|
/// ahead of the candidate cut and adds a scoring bias. With a road-type word and no house
|
||||||
|
/// number, results collapse to one row per street.
|
||||||
|
pub fn search_addresses(&self, query: &str, limit: usize) -> Vec<(usize, i32)> {
|
||||||
if limit == 0 {
|
if limit == 0 {
|
||||||
return Vec::new();
|
return Vec::new();
|
||||||
}
|
}
|
||||||
|
|
@ -912,25 +1091,45 @@ impl PropertyData {
|
||||||
return Vec::new();
|
return Vec::new();
|
||||||
}
|
}
|
||||||
|
|
||||||
let candidate_rows: Vec<u32> = if let Some(postcode) = parsed.full_postcode.as_deref() {
|
let mut candidate_rows = self.address_candidate_rows(&parsed.candidate_terms);
|
||||||
self.postcode_interner
|
|
||||||
|
// A complete postcode contributes its rows too, instead of replacing the road match.
|
||||||
|
if let Some(postcode) = parsed.full_postcode.as_deref() {
|
||||||
|
if let Some(rows) = self
|
||||||
|
.postcode_interner
|
||||||
.get(postcode)
|
.get(postcode)
|
||||||
.and_then(|key| self.postcode_row_index.get(&key))
|
.and_then(|key| self.postcode_row_index.get(&key))
|
||||||
.map(|rows| rows.to_vec())
|
{
|
||||||
.unwrap_or_default()
|
candidate_rows = if candidate_rows.is_empty() {
|
||||||
} else if let Some(rows) = self.best_address_token_rows(&parsed.candidate_terms) {
|
rows.clone()
|
||||||
rows.iter()
|
|
||||||
.take(ADDRESS_SEARCH_CANDIDATE_LIMIT)
|
|
||||||
.copied()
|
|
||||||
.collect()
|
|
||||||
} else {
|
} else {
|
||||||
Vec::new()
|
union_sorted(&candidate_rows, rows)
|
||||||
};
|
};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if candidate_rows.is_empty() {
|
if candidate_rows.is_empty() {
|
||||||
return Vec::new();
|
return Vec::new();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// When the user appended a partial postcode, keep in-area rows ahead of the cut so the
|
||||||
|
// refinement still surfaces even for very common roads. Single pass (stable partition) so
|
||||||
|
// the postcode check — which allocates — runs exactly once per candidate.
|
||||||
|
if let Some(area) = parsed.postcode_area.as_deref() {
|
||||||
|
let mut in_area = Vec::new();
|
||||||
|
let mut others = Vec::new();
|
||||||
|
for &row in &candidate_rows {
|
||||||
|
if self.row_postcode_in_area(row as usize, area) {
|
||||||
|
in_area.push(row);
|
||||||
|
} else {
|
||||||
|
others.push(row);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
in_area.extend(others);
|
||||||
|
candidate_rows = in_area;
|
||||||
|
}
|
||||||
|
candidate_rows.truncate(ADDRESS_SEARCH_CANDIDATE_LIMIT);
|
||||||
|
|
||||||
let mut scored: Vec<(i32, usize, usize)> = candidate_rows
|
let mut scored: Vec<(i32, usize, usize)> = candidate_rows
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.filter_map(|row| {
|
.filter_map(|row| {
|
||||||
|
|
@ -948,18 +1147,29 @@ impl PropertyData {
|
||||||
.then(left.2.cmp(&right.2))
|
.then(left.2.cmp(&right.2))
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Collapse a road browse (road-type word, no house number) to one row per street.
|
||||||
|
let collapse_streets = parsed.numeric_terms.is_empty() && query_has_road_type(query);
|
||||||
|
|
||||||
let mut seen = FxHashSet::default();
|
let mut seen = FxHashSet::default();
|
||||||
let mut results = Vec::with_capacity(limit);
|
let mut results = Vec::with_capacity(limit);
|
||||||
for (_, _, row) in scored {
|
for (score, _, row) in scored {
|
||||||
let address = self.address(row).trim();
|
let address = self.address(row).trim();
|
||||||
if address.is_empty() {
|
if address.is_empty() {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
let key = format!("{}\n{}", address.to_ascii_lowercase(), self.postcode(row));
|
let key = if collapse_streets {
|
||||||
|
format!(
|
||||||
|
"{}\n{}",
|
||||||
|
street_key(address),
|
||||||
|
outcode_of(self.postcode(row))
|
||||||
|
)
|
||||||
|
} else {
|
||||||
|
format!("{}\n{}", address.to_ascii_lowercase(), self.postcode(row))
|
||||||
|
};
|
||||||
if !seen.insert(key) {
|
if !seen.insert(key) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
results.push(row);
|
results.push((row, score));
|
||||||
if results.len() == limit {
|
if results.len() == limit {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
@ -968,36 +1178,75 @@ impl PropertyData {
|
||||||
results
|
results
|
||||||
}
|
}
|
||||||
|
|
||||||
fn best_address_token_rows(&self, terms: &[String]) -> Option<&[u32]> {
|
/// True when the row's postcode begins with the compact partial-postcode `area`
|
||||||
let mut best: Option<&[u32]> = None;
|
/// (e.g. "NW1" or "NW16" matches "NW1 6XE").
|
||||||
|
fn row_postcode_in_area(&self, row: usize, area: &str) -> bool {
|
||||||
|
let mut compact = String::new();
|
||||||
|
for ch in self.postcode(row).chars() {
|
||||||
|
if !ch.is_whitespace() {
|
||||||
|
compact.push(ch.to_ascii_uppercase());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
compact.starts_with(area)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Candidate rows for the distinctive query words. Words typed in full intersect by their
|
||||||
|
/// exact posting lists (precise); a still-being-typed final word with no exact match seeds
|
||||||
|
/// from the smallest prefix-expanded posting list (so partial typing keeps working).
|
||||||
|
fn address_candidate_rows(&self, terms: &[String]) -> Vec<u32> {
|
||||||
|
let mut exact: Vec<&[u32]> = terms
|
||||||
|
.iter()
|
||||||
|
.filter_map(|term| self.address_token_index.get(term).map(Vec::as_slice))
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
if !exact.is_empty() {
|
||||||
|
exact.sort_by_key(|rows| rows.len());
|
||||||
|
let mut acc = exact[0].to_vec();
|
||||||
|
for rows in &exact[1..] {
|
||||||
|
if acc.is_empty() {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
acc = intersect_sorted(&acc, rows);
|
||||||
|
}
|
||||||
|
return acc;
|
||||||
|
}
|
||||||
|
|
||||||
|
self.prefix_seed_rows(terms)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Seed rows from the smallest prefix-expanded term — used only when no word matched an
|
||||||
|
/// indexed token exactly (i.e. the user is still typing the final word).
|
||||||
|
fn prefix_seed_rows(&self, terms: &[String]) -> Vec<u32> {
|
||||||
|
let mut best: Option<Vec<u32>> = None;
|
||||||
for term in terms {
|
for term in terms {
|
||||||
if let Some(rows) = self.address_token_index.get(term) {
|
if term.len() < ADDRESS_SEARCH_PREFIX_MIN_LEN {
|
||||||
if best.is_none_or(|current| rows.len() < current.len()) {
|
|
||||||
best = Some(rows.as_slice());
|
|
||||||
}
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
let Some(tokens) = self.address_prefix_index.get(address_prefix_key(term)) else {
|
||||||
if term.len() < 4 {
|
|
||||||
continue;
|
continue;
|
||||||
}
|
};
|
||||||
|
let mut union: Vec<u32> = Vec::new();
|
||||||
if let Some(tokens) = self.address_prefix_index.get(address_prefix_key(term)) {
|
|
||||||
for token in tokens {
|
for token in tokens {
|
||||||
if !token.starts_with(term) {
|
if !token.starts_with(term) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if let Some(rows) = self.address_token_index.get(token) {
|
if let Some(rows) = self.address_token_index.get(token) {
|
||||||
if best.is_none_or(|current| rows.len() < current.len()) {
|
union = if union.is_empty() {
|
||||||
best = Some(rows.as_slice());
|
rows.clone()
|
||||||
|
} else {
|
||||||
|
union_sorted(&union, rows)
|
||||||
|
};
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if !union.is_empty()
|
||||||
|
&& best
|
||||||
|
.as_ref()
|
||||||
|
.is_none_or(|current| union.len() < current.len())
|
||||||
|
{
|
||||||
|
best = Some(union);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
best.unwrap_or_default()
|
||||||
|
|
||||||
best
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn address_match_score(&self, row: usize, parsed: &AddressQuery) -> Option<i32> {
|
fn address_match_score(&self, row: usize, parsed: &AddressQuery) -> Option<i32> {
|
||||||
|
|
@ -1037,6 +1286,12 @@ impl PropertyData {
|
||||||
if numeric_matches == parsed.numeric_terms.len() && numeric_matches > 0 {
|
if numeric_matches == parsed.numeric_terms.len() && numeric_matches > 0 {
|
||||||
score += 50;
|
score += 50;
|
||||||
}
|
}
|
||||||
|
// Additive bias (never a filter) when the row sits in the appended partial postcode.
|
||||||
|
if let Some(area) = parsed.postcode_area.as_deref() {
|
||||||
|
if self.row_postcode_in_area(row, area) {
|
||||||
|
score += 400;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
Some(score)
|
Some(score)
|
||||||
}
|
}
|
||||||
|
|
@ -1969,16 +2224,23 @@ impl PropertyData {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
let address_token_count_before_prune = address_token_index.len();
|
// Keep every distinctive token: common road words ("high", "church", "station") are
|
||||||
address_token_index.retain(|_, rows| rows.len() <= ADDRESS_SEARCH_MAX_POSTINGS_PER_TOKEN);
|
// exactly what people search, and dropping them made those roads unsearchable while a
|
||||||
|
// prefix fallback surfaced the wrong street ("Highbury" for "High"). The candidate scan
|
||||||
|
// is bounded per query instead (ADDRESS_SEARCH_CANDIDATE_LIMIT), and stop words are
|
||||||
|
// already excluded from the index, so the largest posting lists stay modest.
|
||||||
|
let max_postings = address_token_index
|
||||||
|
.values()
|
||||||
|
.map(Vec::len)
|
||||||
|
.max()
|
||||||
|
.unwrap_or(0);
|
||||||
let address_prefix_index = build_address_prefix_index(&address_token_index);
|
let address_prefix_index = build_address_prefix_index(&address_token_index);
|
||||||
let address_search_interner = address_search_rodeo.into_reader();
|
let address_search_interner = address_search_rodeo.into_reader();
|
||||||
let address_postings_count: usize = address_token_index.values().map(Vec::len).sum();
|
let address_postings_count: usize = address_token_index.values().map(Vec::len).sum();
|
||||||
tracing::info!(
|
tracing::info!(
|
||||||
tokens = address_token_index.len(),
|
tokens = address_token_index.len(),
|
||||||
prefixes = address_prefix_index.len(),
|
prefixes = address_prefix_index.len(),
|
||||||
pruned_tokens =
|
max_postings_per_token = max_postings,
|
||||||
address_token_count_before_prune.saturating_sub(address_token_index.len()),
|
|
||||||
postings = address_postings_count,
|
postings = address_postings_count,
|
||||||
row_tokens = address_search_token_keys.len(),
|
row_tokens = address_search_token_keys.len(),
|
||||||
"Address search index built"
|
"Address search index built"
|
||||||
|
|
@ -2340,6 +2602,79 @@ mod tests {
|
||||||
assert_eq!(parsed.candidate_terms, vec!["downing".to_string()]);
|
assert_eq!(parsed.candidate_terms, vec!["downing".to_string()]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
fn address_query_recovers_appended_partial_postcode_as_bias() {
    let query = parse_address_query("Baker Street NW1");

    // A trailing outcode is recovered as an area bias, not as a complete postcode.
    assert!(query.full_postcode.is_none());
    assert_eq!(query.postcode_area, Some("NW1".to_string()));

    // The road word stays searchable and no house number is invented.
    let expected_terms = vec!["baker".to_string()];
    assert_eq!(query.candidate_terms, expected_terms);
    assert_eq!(query.numeric_terms.len(), 0);
}
|
||||||
|
|
||||||
|
#[test]
fn address_query_recovers_outcode_plus_sector_without_a_phantom_house_number() {
    let query = parse_address_query("High Street CR0 2");

    // Outcode and sector digit are fused into one compact area.
    assert_eq!(query.postcode_area, Some("CR02".to_string()));

    // The lone sector digit must not surface as a house number.
    assert_eq!(query.numeric_terms.len(), 0);

    let expected_terms = vec!["high".to_string()];
    assert_eq!(query.candidate_terms, expected_terms);
}
|
||||||
|
|
||||||
|
#[test]
fn full_postcode_takes_precedence_over_partial_bias() {
    let query = parse_address_query("Baker Street NW1 6XE");

    // A complete postcode is reported as such and leaves no partial-area bias behind.
    assert_eq!(query.full_postcode, Some("NW1 6XE".to_string()));
    assert!(query.postcode_area.is_none());
}
|
||||||
|
|
||||||
|
#[test]
fn intersect_and_union_sorted_row_ids() {
    // Intersection keeps only the ids found on both sides.
    let overlap = intersect_sorted(&[1, 2, 3, 5], &[2, 3, 4, 5]);
    assert_eq!(overlap, vec![2, 3, 5]);
    let disjoint = intersect_sorted(&[1, 2], &[3, 4]);
    assert_eq!(disjoint, Vec::<u32>::new());

    // Union merges both sides and emits a shared id once.
    let merged = union_sorted(&[1, 3, 5], &[2, 3, 4]);
    assert_eq!(merged, vec![1, 2, 3, 4, 5]);
    let from_empty = union_sorted(&[], &[2, 4]);
    assert_eq!(from_empty, vec![2, 4]);
}
|
||||||
|
|
||||||
|
#[test]
fn street_key_collapses_house_numbers_and_flats() {
    // Each address maps to the same key once its house-number / flat prefix is dropped.
    let cases = [
        ("12 Baker Street", "baker street"),
        ("5 Baker Street", "baker street"),
        ("Flat 2, 10 Downing Street", "downing street"),
        ("221B Baker Street", "baker street"),
    ];
    for (address, expected) in cases {
        assert_eq!(street_key(address), expected);
    }
}
|
||||||
|
|
||||||
|
#[test]
fn street_key_keeps_ordinal_street_names() {
    // An ordinal belongs to the street name, so it survives prefix stripping.
    let unnumbered = street_key("2nd Avenue");
    assert_eq!(unnumbered, "2nd avenue");
    let numbered = street_key("12 3rd Avenue");
    assert_eq!(numbered, "3rd avenue");

    // Only digit-plus-suffix tokens count as ordinals.
    assert_eq!(is_ordinal_token("21st"), true);
    assert_eq!(is_ordinal_token("21"), false);
    assert_eq!(is_ordinal_token("221b"), false);
}
|
||||||
|
|
||||||
|
#[test]
fn postcode_area_recovered_only_from_the_trailing_position() {
    // A leading road designation ("A4") must not be read as an area refinement.
    let leading = parse_address_query("A4 Great West Road");
    assert!(leading.postcode_area.is_none());

    // A genuine trailing outcode is still recovered.
    let trailing = parse_address_query("Great West Road W4");
    assert_eq!(trailing.postcode_area, Some("W4".to_string()));
}
|
||||||
|
|
||||||
|
#[test]
fn road_type_detection() {
    // Queries containing a road-type word are detected...
    for query in ["high street", "acacia avenue"] {
        assert!(query_has_road_type(query));
    }
    // ...while bare names are not.
    for query in ["acacia", "london"] {
        assert!(!query_has_road_type(query));
    }
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn address_query_parsing_keeps_partial_terms_for_row_matching() {
|
fn address_query_parsing_keeps_partial_terms_for_row_matching() {
|
||||||
let parsed = parse_address_query("settlers cour");
|
let parsed = parse_address_query("settlers cour");
|
||||||
|
|
|
||||||
|
|
@ -507,8 +507,7 @@ async fn main() -> anyhow::Result<()> {
|
||||||
"property_borders.pmtiles",
|
"property_borders.pmtiles",
|
||||||
);
|
);
|
||||||
|
|
||||||
let noise_overlay_reader =
|
let noise_overlay_reader = init_required_tile_reader("Noise", &noise_overlay_tiles).await?;
|
||||||
init_required_tile_reader("Noise", &noise_overlay_tiles).await?;
|
|
||||||
let satellite_reader = init_required_tile_reader("Satellite", &satellite_tiles).await?;
|
let satellite_reader = init_required_tile_reader("Satellite", &satellite_tiles).await?;
|
||||||
let satellite_highres_reader =
|
let satellite_highres_reader =
|
||||||
init_required_tile_reader("Satellite high-res", &satellite_highres_tiles).await?;
|
init_required_tile_reader("Satellite high-res", &satellite_highres_tiles).await?;
|
||||||
|
|
|
||||||
|
|
@ -2,14 +2,26 @@ use std::sync::Arc;
|
||||||
|
|
||||||
use axum::extract::{Query, State};
|
use axum::extract::{Query, State};
|
||||||
use axum::response::Json;
|
use axum::response::Json;
|
||||||
|
use rustc_hash::FxHashSet;
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
use tracing::info;
|
use tracing::info;
|
||||||
|
|
||||||
use crate::api_error::ApiError;
|
use crate::api_error::ApiError;
|
||||||
use crate::consts::PLACES_LIMIT;
|
use crate::consts::PLACES_LIMIT;
|
||||||
use crate::data::{normalize_search_text, slugify};
|
use crate::data::{
|
||||||
|
compute_trigrams, normalize_search_text, place_alias_tokens, slugify, trigram_similarity,
|
||||||
|
};
|
||||||
use crate::state::SharedState;
|
use crate::state::SharedState;
|
||||||
|
|
||||||
|
/// Connective words dropped from a place query so "fish and chips" matches a place
|
||||||
|
/// stored (after `&` is normalized away) as "fish chips".
|
||||||
|
const QUERY_STOP_WORDS: &[&str] = &["and", "the", "of"];
|
||||||
|
|
||||||
|
/// Minimum trigram similarity for a fuzzy place match.
|
||||||
|
const FUZZY_MIN_SIMILARITY: f32 = 0.42;
|
||||||
|
/// Run the (linear) fuzzy pass only when the exact passes found fewer than this.
|
||||||
|
const FUZZY_TRIGGER_BELOW: usize = 3;
|
||||||
|
|
||||||
#[derive(Serialize)]
|
#[derive(Serialize)]
|
||||||
pub struct PlaceResult {
|
pub struct PlaceResult {
|
||||||
name: String,
|
name: String,
|
||||||
|
|
@ -29,6 +41,43 @@ pub struct AddressResult {
|
||||||
lon: f32,
|
lon: f32,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// A single, category-tagged, relevance-scored result. The frontend renders these in order,
|
||||||
|
/// so ranking is unified across places, outcodes, postcodes and addresses instead of the old
|
||||||
|
/// fixed positional bucketing.
|
||||||
|
#[derive(Serialize)]
|
||||||
|
#[serde(tag = "type", rename_all = "lowercase")]
|
||||||
|
pub enum UnifiedResult {
|
||||||
|
Place {
|
||||||
|
name: String,
|
||||||
|
slug: String,
|
||||||
|
place_type: String,
|
||||||
|
lat: f32,
|
||||||
|
lon: f32,
|
||||||
|
#[serde(skip_serializing_if = "Option::is_none")]
|
||||||
|
city: Option<String>,
|
||||||
|
score: f32,
|
||||||
|
},
|
||||||
|
Postcode {
|
||||||
|
label: String,
|
||||||
|
score: f32,
|
||||||
|
},
|
||||||
|
Address {
|
||||||
|
address: String,
|
||||||
|
postcode: String,
|
||||||
|
lat: f32,
|
||||||
|
lon: f32,
|
||||||
|
score: f32,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
fn unified_score(result: &UnifiedResult) -> f32 {
|
||||||
|
match result {
|
||||||
|
UnifiedResult::Place { score, .. }
|
||||||
|
| UnifiedResult::Postcode { score, .. }
|
||||||
|
| UnifiedResult::Address { score, .. } => *score,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(Serialize)]
|
#[derive(Serialize)]
|
||||||
pub struct PlacesResponse {
|
pub struct PlacesResponse {
|
||||||
places: Vec<PlaceResult>,
|
places: Vec<PlaceResult>,
|
||||||
|
|
@ -36,6 +85,9 @@ pub struct PlacesResponse {
|
||||||
postcodes: Vec<String>,
|
postcodes: Vec<String>,
|
||||||
#[serde(skip_serializing_if = "Vec::is_empty")]
|
#[serde(skip_serializing_if = "Vec::is_empty")]
|
||||||
addresses: Vec<AddressResult>,
|
addresses: Vec<AddressResult>,
|
||||||
|
/// Unified, relevance-ordered results. Preferred by the frontend; the arrays above remain
|
||||||
|
/// for backward compatibility.
|
||||||
|
results: Vec<UnifiedResult>,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Deserialize)]
|
#[derive(Deserialize)]
|
||||||
|
|
@ -44,6 +96,9 @@ pub struct PlacesParams {
|
||||||
q: String,
|
q: String,
|
||||||
/// If set, only return places that have travel time data for this mode.
|
/// If set, only return places that have travel time data for this mode.
|
||||||
mode: Option<String>,
|
mode: Option<String>,
|
||||||
|
/// Optional map-viewport centre used to bias ranking toward what the user is looking at.
|
||||||
|
lat: Option<f32>,
|
||||||
|
lng: Option<f32>,
|
||||||
}
|
}
|
||||||
|
|
||||||
fn compact_postcode_query(query: &str) -> String {
|
fn compact_postcode_query(query: &str) -> String {
|
||||||
|
|
@ -93,6 +148,131 @@ fn postcode_starts_with_compact(postcode: &str, compact_query: &str) -> bool {
|
||||||
current.is_none()
|
current.is_none()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Returns true when `token` has the shape of a partial postcode: 2 to 4 bytes long, starts
/// with an ASCII letter, contains at least one ASCII digit, and is entirely ASCII
/// alphanumeric.
fn is_postcode_fragmentish(token: &str) -> bool {
    if token.len() < 2 || token.len() > 4 {
        return false;
    }

    let mut chars = token.chars();
    if !chars.next().is_some_and(|ch| ch.is_ascii_alphabetic()) {
        return false;
    }

    // The first character is a letter, so any digit must come from the remainder.
    let mut saw_digit = false;
    for ch in chars {
        if !ch.is_ascii_alphanumeric() {
            return false;
        }
        saw_digit |= ch.is_ascii_digit();
    }
    saw_digit
}
|
||||||
|
|
||||||
|
/// Peel a trailing geographic refinement (outcode, or outcode + sector digit) off the query.
|
||||||
|
/// "camden nw1" → ("camden", Some("NW1")); the core matches the place, the refinement biases
|
||||||
|
/// ranking and drives the outcode/postcode lists — instead of breaking the match entirely.
|
||||||
|
fn split_geographic_refinement(query: &str) -> (String, Option<String>) {
|
||||||
|
let words: Vec<&str> = query.split_whitespace().collect();
|
||||||
|
if words.len() < 2 {
|
||||||
|
return (query.to_string(), None);
|
||||||
|
}
|
||||||
|
let last = words[words.len() - 1];
|
||||||
|
if words.len() >= 3 && last.len() == 1 && last.chars().all(|ch| ch.is_ascii_digit()) {
|
||||||
|
let prev = words[words.len() - 2];
|
||||||
|
if is_postcode_fragmentish(prev) {
|
||||||
|
let area = format!("{}{}", prev.to_ascii_uppercase(), last);
|
||||||
|
return (words[..words.len() - 2].join(" "), Some(area));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if is_postcode_fragmentish(last) {
|
||||||
|
return (
|
||||||
|
words[..words.len() - 1].join(" "),
|
||||||
|
Some(last.to_ascii_uppercase()),
|
||||||
|
);
|
||||||
|
}
|
||||||
|
(query.to_string(), None)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Content words of a place query, dropping connectives so "fish and chips" matches "Fish & Chips".
|
||||||
|
fn query_content_tokens(query_search: &str) -> Vec<&str> {
|
||||||
|
query_search
|
||||||
|
.split(' ')
|
||||||
|
.filter(|token| !token.is_empty() && !QUERY_STOP_WORDS.contains(token))
|
||||||
|
.collect()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Base relevance tier for a place, or None if it does not match at all.
|
||||||
|
fn place_base_score(
|
||||||
|
search_text: &str,
|
||||||
|
name_lower: &str,
|
||||||
|
query_search: &str,
|
||||||
|
query_lower: &str,
|
||||||
|
query_tokens: &[&str],
|
||||||
|
) -> Option<f32> {
|
||||||
|
if query_search.is_empty() {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut exact = name_lower == query_lower;
|
||||||
|
let mut prefix = name_lower.starts_with(query_lower);
|
||||||
|
for alias in search_text.split(" | ") {
|
||||||
|
if alias == query_search {
|
||||||
|
exact = true;
|
||||||
|
}
|
||||||
|
if alias.starts_with(query_search) {
|
||||||
|
prefix = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if exact {
|
||||||
|
return Some(1000.0);
|
||||||
|
}
|
||||||
|
if prefix {
|
||||||
|
return Some(820.0);
|
||||||
|
}
|
||||||
|
|
||||||
|
if !query_tokens.is_empty() {
|
||||||
|
let all_covered = query_tokens.iter().all(|query_token| {
|
||||||
|
place_alias_tokens(search_text).any(|token| {
|
||||||
|
token == *query_token || (query_token.len() >= 2 && token.starts_with(query_token))
|
||||||
|
})
|
||||||
|
});
|
||||||
|
if all_covered {
|
||||||
|
return Some(640.0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
None
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Additive ranking bonus derived from a place's type rank and population.
///
/// * Type: 8 points for every step `type_rank` sits below 6 (ranks of 6 or more earn nothing).
/// * Population: `4 * ln(population + 1)`, capped at 64 points.
fn place_modifiers(type_rank: u8, population: u32) -> f32 {
    let rank_steps = 6u8.saturating_sub(type_rank);
    let rank_points = f32::from(rank_steps) * 8.0;

    let population_points = ((population as f32 + 1.0).ln() * 4.0).min(64.0);

    rank_points + population_points
}
|
||||||
|
|
||||||
|
/// Bonus that decays exponentially with distance from the bias centre (viewport or refinement).
///
/// Without a centre the bonus is `0.0`. Otherwise the distance is taken in coordinate units,
/// with the longitude difference scaled by the cosine of the centre latitude, and the bonus is
/// `160 * exp(-dist / 0.3)` — at most 160, which stays below the gap between match tiers so it
/// reorders results within a tier and breaks ties without overriding exact matches.
fn proximity_bonus(center: Option<(f32, f32)>, lat: f32, lon: f32) -> f32 {
    match center {
        None => 0.0,
        Some((center_lat, center_lon)) => {
            let north_south = lat - center_lat;
            let east_west = (lon - center_lon) * center_lat.to_radians().cos();
            let dist = (north_south * north_south + east_west * east_west).sqrt();
            160.0 * (-dist / 0.3).exp()
        }
    }
}
|
||||||
|
|
||||||
|
/// Translate an address match's raw specificity score into the unified relevance scale.
///
/// The raw score is capped at 1000 and mapped linearly: 460 plus 0.47 per raw point, so the
/// result never exceeds 930.
fn address_unified_score(raw: i32) -> f32 {
    let capped = raw.min(1000) as f32;
    460.0 + capped * 0.47
}
|
||||||
|
|
||||||
|
/// Resolve the outcode a compact partial postcode sits in (e.g. "NW16" → "nw1"), trying
/// progressively shorter prefixes against the known outcode set.
///
/// `name_lower` holds the known outcodes in lower case; `area` is compared case-insensitively.
/// Prefixes are tried longest first and never shorter than two bytes. Returns the index into
/// `name_lower` of the first prefix that is a known outcode, or `None` when none is.
///
/// Prefix lengths that would fall inside a multi-byte character are skipped, so non-ASCII
/// input can never trigger a slice-boundary panic.
fn resolve_outcode_idx(name_lower: &[String], area: &str) -> Option<usize> {
    let area_lower = area.to_lowercase();
    (2..=area_lower.len())
        .rev()
        // `str::get` returns `None` for a range that does not end on a char boundary.
        .filter_map(|len| area_lower.get(..len))
        .find_map(|candidate| {
            name_lower
                .iter()
                .position(|name| name.as_str() == candidate)
        })
}
|
||||||
|
|
||||||
pub async fn get_places(
|
pub async fn get_places(
|
||||||
State(shared): State<Arc<SharedState>>,
|
State(shared): State<Arc<SharedState>>,
|
||||||
Query(params): Query<PlacesParams>,
|
Query(params): Query<PlacesParams>,
|
||||||
|
|
@ -106,154 +286,229 @@ pub async fn get_places(
|
||||||
|
|
||||||
let limit = PLACES_LIMIT;
|
let limit = PLACES_LIMIT;
|
||||||
let mode_filter = params.mode;
|
let mode_filter = params.mode;
|
||||||
|
let viewport = match (params.lat, params.lng) {
|
||||||
|
(Some(lat), Some(lng)) => Some((lat, lng)),
|
||||||
|
_ => None,
|
||||||
|
};
|
||||||
|
|
||||||
let places = tokio::task::spawn_blocking(move || {
|
let response = tokio::task::spawn_blocking(move || {
|
||||||
let t0 = std::time::Instant::now();
|
let t0 = std::time::Instant::now();
|
||||||
let query_lower = query.to_lowercase();
|
|
||||||
let query_search = normalize_search_text(&query);
|
|
||||||
let pd = &state.place_data;
|
let pd = &state.place_data;
|
||||||
let od = &state.outcode_data;
|
let od = &state.outcode_data;
|
||||||
let postcode_data = &state.postcode_data;
|
let postcode_data = &state.postcode_data;
|
||||||
let tt_store = &state.travel_time_store;
|
let tt_store = &state.travel_time_store;
|
||||||
let property_data = &state.data;
|
let property_data = &state.data;
|
||||||
|
|
||||||
// Linear scan — ~50-100k rows, <1ms
|
// Peel any appended outcode/partial-postcode so the place text matches on the core
|
||||||
// Tuple: (row_idx, is_exact, is_prefix, type_rank, population, name_len, slug)
|
// words while the refinement biases ranking and drives the outcode/postcode lists.
|
||||||
let mut matches: Vec<(usize, bool, bool, u8, u32, usize, String)> = pd
|
let (split_query, refinement) = split_geographic_refinement(&query);
|
||||||
.name_search
|
// Only honour the refinement when it resolves to a real outcode; otherwise (e.g. "the o2",
|
||||||
.iter()
|
// where "o2" looks postcode-ish but is not an outcode) treat the whole query as place text.
|
||||||
.enumerate()
|
let refinement_outcode = refinement
|
||||||
.filter_map(|(idx, search_text)| {
|
.as_deref()
|
||||||
if query_search.is_empty() || !search_text.contains(&query_search) {
|
.and_then(|area| resolve_outcode_idx(&od.name_lower, area));
|
||||||
return None;
|
let place_query = if refinement.is_some() && refinement_outcode.is_none() {
|
||||||
}
|
query.clone()
|
||||||
let slug = slugify(&pd.name[idx]);
|
} else {
|
||||||
|
split_query
|
||||||
|
};
|
||||||
|
let query_search = normalize_search_text(&place_query);
|
||||||
|
let query_lower = place_query.to_lowercase();
|
||||||
|
let query_tokens = query_content_tokens(&query_search);
|
||||||
|
|
||||||
// If mode filter is set, keep the historical travel destination set only.
|
// Bias centre: explicit viewport, else the resolved refinement outcode's centroid.
|
||||||
if let Some(ref mode) = mode_filter {
|
let bias_center = viewport.or_else(|| refinement_outcode.map(|idx| od.centroids[idx]));
|
||||||
if !pd.travel_destination[idx] || !tt_store.has_destination(mode, &slug) {
|
|
||||||
return None;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
let is_exact = search_text
|
// ---- Places: candidate rows from the inverted token index, then exact/prefix/token-AND
|
||||||
.split(" | ")
|
// scoring — bounded by matched candidates, not the ~1M-row corpus. Fuzzy fallback uses the
|
||||||
.any(|alias| alias == query_search || pd.name_lower[idx] == query_lower);
|
// (small) trigram index over fuzzy-eligible rows only.
|
||||||
let is_prefix = search_text
|
let mut place_results: Vec<(f32, PlaceResult)> = Vec::new();
|
||||||
.split(" | ")
|
let mut matched_place_idx: FxHashSet<usize> = FxHashSet::default();
|
||||||
.any(|alias| alias.starts_with(&query_search))
|
let make_place = |idx: usize| PlaceResult {
|
||||||
|| pd.name_lower[idx].starts_with(&query_lower);
|
name: pd.name[idx].clone(),
|
||||||
Some((
|
slug: slugify(&pd.name[idx]),
|
||||||
idx,
|
place_type: pd.place_type.get(idx).to_string(),
|
||||||
is_exact,
|
lat: pd.lat[idx],
|
||||||
is_prefix,
|
lon: pd.lon[idx],
|
||||||
pd.type_rank[idx],
|
city: pd.city[idx].clone(),
|
||||||
pd.population[idx],
|
};
|
||||||
pd.name[idx].len(),
|
let passes_mode = |idx: usize| {
|
||||||
slug,
|
mode_filter.as_ref().is_none_or(|mode| {
|
||||||
))
|
pd.travel_destination[idx]
|
||||||
|
&& tt_store.has_destination(mode, &slugify(&pd.name[idx]))
|
||||||
})
|
})
|
||||||
.collect();
|
};
|
||||||
|
|
||||||
// Sort: exact first, then prefix, then type rank asc, then population desc, then name length asc
|
for row in pd.place_candidate_rows(&query_tokens) {
|
||||||
matches.sort_unstable_by(|lhs, rhs| {
|
let idx = row as usize;
|
||||||
rhs.1
|
let Some(base) = place_base_score(
|
||||||
.cmp(&lhs.1)
|
&pd.name_search[idx],
|
||||||
.then(rhs.2.cmp(&lhs.2))
|
&pd.name_lower[idx],
|
||||||
.then(lhs.3.cmp(&rhs.3))
|
&query_search,
|
||||||
.then(rhs.4.cmp(&lhs.4))
|
&query_lower,
|
||||||
.then(lhs.5.cmp(&rhs.5))
|
&query_tokens,
|
||||||
});
|
) else {
|
||||||
|
continue;
|
||||||
matches.truncate(limit);
|
};
|
||||||
|
if !passes_mode(idx) {
|
||||||
let mut results: Vec<PlaceResult> = matches
|
continue;
|
||||||
.iter()
|
|
||||||
.map(|(idx, .., slug)| PlaceResult {
|
|
||||||
name: pd.name[*idx].clone(),
|
|
||||||
slug: slug.clone(),
|
|
||||||
place_type: pd.place_type.get(*idx).to_string(),
|
|
||||||
lat: pd.lat[*idx],
|
|
||||||
lon: pd.lon[*idx],
|
|
||||||
city: pd.city[*idx].clone(),
|
|
||||||
})
|
|
||||||
.collect();
|
|
||||||
|
|
||||||
// Also search outcodes (skip when mode filter is set — outcodes aren't travel destinations)
|
|
||||||
if mode_filter.is_none() {
|
|
||||||
let query_upper = query_lower.to_uppercase();
|
|
||||||
let mut outcode_results: Vec<PlaceResult> = od
|
|
||||||
.name_lower
|
|
||||||
.iter()
|
|
||||||
.enumerate()
|
|
||||||
.filter_map(|(idx, name)| {
|
|
||||||
if !name.starts_with(&query_lower) {
|
|
||||||
return None;
|
|
||||||
}
|
}
|
||||||
let is_exact = name.len() == query_lower.len();
|
let score = base
|
||||||
Some((idx, is_exact))
|
+ place_modifiers(pd.type_rank[idx], pd.population[idx])
|
||||||
})
|
+ proximity_bonus(bias_center, pd.lat[idx], pd.lon[idx]);
|
||||||
.collect::<Vec<_>>()
|
matched_place_idx.insert(idx);
|
||||||
.into_iter()
|
place_results.push((score, make_place(idx)));
|
||||||
.map(|(idx, _is_exact)| PlaceResult {
|
}
|
||||||
|
|
||||||
|
// Fuzzy (trigram) fallback only when the exact passes were thin and the query is long
|
||||||
|
// enough to be discriminating.
|
||||||
|
if place_results.len() < FUZZY_TRIGGER_BELOW && query_search.len() >= 4 {
|
||||||
|
let query_trigrams = compute_trigrams(&place_query);
|
||||||
|
for row in pd.fuzzy_candidate_rows(&query_trigrams) {
|
||||||
|
let idx = row as usize;
|
||||||
|
if matched_place_idx.contains(&idx) || !passes_mode(idx) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
let similarity =
|
||||||
|
trigram_similarity(&query_trigrams, &compute_trigrams(&pd.name[idx]));
|
||||||
|
if similarity < FUZZY_MIN_SIMILARITY {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
let score = 280.0
|
||||||
|
+ similarity * 120.0
|
||||||
|
+ place_modifiers(pd.type_rank[idx], pd.population[idx])
|
||||||
|
+ proximity_bonus(bias_center, pd.lat[idx], pd.lon[idx]);
|
||||||
|
matched_place_idx.insert(idx);
|
||||||
|
place_results.push((score, make_place(idx)));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Outcodes (skipped under a mode filter) ----
|
||||||
|
let push_outcode = |results: &mut Vec<(f32, PlaceResult)>, idx: usize, base: f32| {
|
||||||
|
let (clat, clon) = od.centroids[idx];
|
||||||
|
results.push((
|
||||||
|
base + proximity_bonus(bias_center, clat, clon),
|
||||||
|
PlaceResult {
|
||||||
name: od.names[idx].clone(),
|
name: od.names[idx].clone(),
|
||||||
slug: od.names[idx].to_lowercase(),
|
slug: od.names[idx].to_lowercase(),
|
||||||
place_type: "outcode".to_string(),
|
place_type: "outcode".to_string(),
|
||||||
lat: od.centroids[idx].0,
|
lat: clat,
|
||||||
lon: od.centroids[idx].1,
|
lon: clon,
|
||||||
city: od.cities[idx].clone(),
|
city: od.cities[idx].clone(),
|
||||||
})
|
},
|
||||||
.collect();
|
));
|
||||||
|
};
|
||||||
// Sort outcodes: exact first, then by name length (shorter = broader area)
|
if mode_filter.is_none() {
|
||||||
outcode_results.sort_unstable_by(|a, b| {
|
if let Some(idx) = refinement_outcode {
|
||||||
let a_exact = a.name.eq_ignore_ascii_case(&query_upper);
|
// A refinement ("camden nw1") resolves to exactly one outcode — no NW10/NW11 noise.
|
||||||
let b_exact = b.name.eq_ignore_ascii_case(&query_upper);
|
push_outcode(&mut place_results, idx, 980.0);
|
||||||
b_exact.cmp(&a_exact).then(a.name.len().cmp(&b.name.len()))
|
} else if looks_like_postcode_prefix(&query) {
|
||||||
});
|
// A bare postcode-prefix query ("e1") lists matching outcodes (e1, e10, e11, ...).
|
||||||
|
let area_lower = compact_postcode_query(&query).to_lowercase();
|
||||||
// Prepend outcode results (up to 3) before place results, keeping total ≤ limit
|
for idx in 0..od.names.len() {
|
||||||
outcode_results.truncate(3);
|
let name = &od.name_lower[idx];
|
||||||
let place_slots = limit.saturating_sub(outcode_results.len());
|
let is_exact = *name == area_lower;
|
||||||
results.truncate(place_slots);
|
if !(name.starts_with(&area_lower) || area_lower.starts_with(name.as_str())) {
|
||||||
outcode_results.append(&mut results);
|
continue;
|
||||||
results = outcode_results;
|
}
|
||||||
|
push_outcode(
|
||||||
|
&mut place_results,
|
||||||
|
idx,
|
||||||
|
if is_exact { 980.0 } else { 760.0 },
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let postcodes: Vec<String> = if mode_filter.is_none() && looks_like_postcode_prefix(&query)
|
place_results.sort_by(|left, right| right.0.total_cmp(&left.0));
|
||||||
{
|
place_results.truncate(limit);
|
||||||
let compact_query = compact_postcode_query(&query);
|
|
||||||
postcode_data
|
|
||||||
.postcodes
|
|
||||||
.iter()
|
|
||||||
.filter(|postcode| postcode_starts_with_compact(postcode, &compact_query))
|
|
||||||
.filter(|postcode| !property_data.rows_for_postcode(postcode).is_empty())
|
|
||||||
.take(limit)
|
|
||||||
.cloned()
|
|
||||||
.collect()
|
|
||||||
} else {
|
|
||||||
Vec::new()
|
|
||||||
};
|
|
||||||
|
|
||||||
let addresses: Vec<AddressResult> = if mode_filter.is_none() {
|
// ---- Postcodes (full-postcode prefix list) ----
|
||||||
property_data
|
let mut postcode_results: Vec<(f32, String)> = Vec::new();
|
||||||
.search_addresses(&query, limit)
|
if mode_filter.is_none() && looks_like_postcode_prefix(&query) {
|
||||||
.into_iter()
|
let compact_query = compact_postcode_query(&query);
|
||||||
.map(|row| AddressResult {
|
for postcode in &postcode_data.postcodes {
|
||||||
|
if !postcode_starts_with_compact(postcode, &compact_query) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if property_data.rows_for_postcode(postcode).is_empty() {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
let compact_pc: String =
|
||||||
|
postcode.chars().filter(|ch| !ch.is_whitespace()).collect();
|
||||||
|
let score = if compact_pc == compact_query {
|
||||||
|
960.0
|
||||||
|
} else {
|
||||||
|
900.0
|
||||||
|
};
|
||||||
|
postcode_results.push((score, postcode.clone()));
|
||||||
|
if postcode_results.len() >= limit {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
postcode_results.sort_by(|left, right| right.0.total_cmp(&left.0));
|
||||||
|
|
||||||
|
// ---- Addresses ----
|
||||||
|
let mut address_results: Vec<(f32, AddressResult)> = Vec::new();
|
||||||
|
if mode_filter.is_none() {
|
||||||
|
for (row, raw) in property_data.search_addresses(&query, limit) {
|
||||||
|
let lat = property_data.lat[row];
|
||||||
|
let lon = property_data.lon[row];
|
||||||
|
let score = address_unified_score(raw) + proximity_bonus(bias_center, lat, lon);
|
||||||
|
address_results.push((
|
||||||
|
score,
|
||||||
|
AddressResult {
|
||||||
address: property_data.address(row).trim().to_string(),
|
address: property_data.address(row).trim().to_string(),
|
||||||
postcode: property_data.postcode(row).to_string(),
|
postcode: property_data.postcode(row).to_string(),
|
||||||
lat: property_data.lat[row],
|
lat,
|
||||||
lon: property_data.lon[row],
|
lon,
|
||||||
})
|
},
|
||||||
.collect()
|
));
|
||||||
} else {
|
}
|
||||||
Vec::new()
|
}
|
||||||
};
|
address_results.sort_by(|left, right| right.0.total_cmp(&left.0));
|
||||||
|
|
||||||
|
// ---- Unified merge: one relevance-ordered list across every source ----
|
||||||
|
let mut unified: Vec<UnifiedResult> = Vec::new();
|
||||||
|
for (score, place) in &place_results {
|
||||||
|
unified.push(UnifiedResult::Place {
|
||||||
|
name: place.name.clone(),
|
||||||
|
slug: place.slug.clone(),
|
||||||
|
place_type: place.place_type.clone(),
|
||||||
|
lat: place.lat,
|
||||||
|
lon: place.lon,
|
||||||
|
city: place.city.clone(),
|
||||||
|
score: *score,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
for (score, postcode) in &postcode_results {
|
||||||
|
unified.push(UnifiedResult::Postcode {
|
||||||
|
label: postcode.clone(),
|
||||||
|
score: *score,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
for (score, address) in &address_results {
|
||||||
|
unified.push(UnifiedResult::Address {
|
||||||
|
address: address.address.clone(),
|
||||||
|
postcode: address.postcode.clone(),
|
||||||
|
lat: address.lat,
|
||||||
|
lon: address.lon,
|
||||||
|
score: *score,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
unified.sort_by(|left, right| unified_score(right).total_cmp(&unified_score(left)));
|
||||||
|
unified.truncate(limit);
|
||||||
|
|
||||||
|
let places: Vec<PlaceResult> = place_results.into_iter().map(|(_, p)| p).collect();
|
||||||
|
let postcodes: Vec<String> = postcode_results.into_iter().map(|(_, p)| p).collect();
|
||||||
|
let addresses: Vec<AddressResult> = address_results.into_iter().map(|(_, a)| a).collect();
|
||||||
|
|
||||||
let elapsed = t0.elapsed();
|
let elapsed = t0.elapsed();
|
||||||
info!(
|
info!(
|
||||||
query = query.as_str(),
|
query = query.as_str(),
|
||||||
results = results.len(),
|
results = unified.len(),
|
||||||
|
places = places.len(),
|
||||||
postcodes = postcodes.len(),
|
postcodes = postcodes.len(),
|
||||||
addresses = addresses.len(),
|
addresses = addresses.len(),
|
||||||
scanned = pd.name_lower.len(),
|
scanned = pd.name_lower.len(),
|
||||||
|
|
@ -262,16 +517,17 @@ pub async fn get_places(
|
||||||
"GET /api/places"
|
"GET /api/places"
|
||||||
);
|
);
|
||||||
|
|
||||||
(results, postcodes, addresses)
|
PlacesResponse {
|
||||||
|
places,
|
||||||
|
postcodes,
|
||||||
|
addresses,
|
||||||
|
results: unified,
|
||||||
|
}
|
||||||
})
|
})
|
||||||
.await
|
.await
|
||||||
.map_err(|error| ApiError::Internal(error.to_string()))?;
|
.map_err(|error| ApiError::Internal(error.to_string()))?;
|
||||||
|
|
||||||
Ok(Json(PlacesResponse {
|
Ok(Json(response))
|
||||||
places: places.0,
|
|
||||||
postcodes: places.1,
|
|
||||||
addresses: places.2,
|
|
||||||
}))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
|
|
@ -293,4 +549,88 @@ mod tests {
|
||||||
assert!(postcode_starts_with_compact("SW1A 1AA", "SW1A1"));
|
assert!(postcode_starts_with_compact("SW1A 1AA", "SW1A1"));
|
||||||
assert!(!postcode_starts_with_compact("SW1A 1AA", "SW1A2"));
|
assert!(!postcode_starts_with_compact("SW1A 1AA", "SW1A2"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
fn refinement_splits_off_trailing_outcode() {
    // (query, expected core text, expected refinement)
    let cases = [
        ("camden nw1", "camden", Some("NW1")),
        ("high street cr0 2", "high street", Some("CR02")),
        // A bare outcode is not split (handled by the outcode/postcode path directly).
        ("e14", "e14", None),
        // No trailing postcode → unchanged.
        ("baker street", "baker street", None),
    ];
    for (query, core, area) in cases {
        assert_eq!(
            split_geographic_refinement(query),
            (core.to_string(), area.map(String::from))
        );
    }
}
|
||||||
|
|
||||||
|
#[test]
fn query_tokens_drop_connectives() {
    // Connectives such as "and" / "of" must not survive tokenisation.
    let fish_and_chips: Vec<&str> = query_content_tokens("fish and chips");
    assert_eq!(fish_and_chips, ["fish", "chips"]);

    let isle_of_dogs: Vec<&str> = query_content_tokens("isle of dogs");
    assert_eq!(isle_of_dogs, ["isle", "dogs"]);
}
|
||||||
|
|
||||||
|
fn base(search: &str, query: &str) -> Option<f32> {
|
||||||
|
let q = normalize_search_text(query);
|
||||||
|
let tokens = query_content_tokens(&q);
|
||||||
|
place_base_score(search, search, &q, &query.to_lowercase(), &tokens)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
fn place_match_tiers_order_exact_above_prefix_above_token_and() {
    // Strongest tier first: exact, prefix, token-AND.
    let tiers = [
        base("camden", "camden").unwrap(),
        base("camden town", "camden").unwrap(),
        base("camden market", "market camden").unwrap(),
    ];
    assert!(tiers[0] > tiers[1]);
    assert!(tiers[1] > tiers[2]);

    // A reordered multi-word query still matches via token-AND.
    assert!(matches!(
        base("manchester piccadilly", "piccadilly manchester"),
        Some(_)
    ));

    // Pure infix substrings no longer match (candidates are token-based): "ford" must not
    // surface "Stratford" — that was the old population-dominated noise.
    assert_eq!(base("stratford", "ford"), None);

    // Appended noise that matches nothing yields no match (the route strips postcodes first).
    assert_eq!(base("camden", "camden zzzz"), None);
}
|
||||||
|
|
||||||
|
#[test]
fn address_full_postcode_outranks_an_outcode_prefix() {
    // Base score given to an outcode that merely prefix-matches the query.
    const OUTCODE_PREFIX_BASE: f32 = 760.0;

    // raw 1200 ≈ road + full postcode + number.
    let full_address = address_unified_score(1200);
    // raw 200 ≈ a road-only address.
    let road_only = address_unified_score(200);

    assert!(full_address > OUTCODE_PREFIX_BASE);
    assert!(road_only < OUTCODE_PREFIX_BASE);
    assert!(full_address > road_only);
}
|
||||||
|
|
||||||
|
#[test]
fn proximity_bonus_decays_and_never_flips_match_tiers() {
    let centre = Some((51.5, -0.1));
    let at_centre = proximity_bonus(centre, 51.5, -0.1);
    let far_away = proximity_bonus(centre, 53.5, -2.0);

    // The bonus shrinks with distance and never exceeds its 160-point ceiling.
    assert!(at_centre > far_away);
    assert!(at_centre <= 160.0);
    // Smaller than the 180-pt gap between exact (1000) and prefix (820).
    assert!(at_centre < 180.0);

    // No bias centre → no bonus at all.
    assert_eq!(proximity_bonus(None, 51.5, -0.1), 0.0);
}
|
||||||
|
|
||||||
|
#[test]
fn resolve_outcode_idx_handles_sectorised_area_and_unknown() {
    let known = ["nw1", "e14"].map(String::from);

    // (area, expected index into `known`)
    let cases = [
        // "NW16" → outcode NW1 (strips the sector digit); "E14" → exact.
        ("NW16", Some(0)),
        ("E14", Some(1)),
        // A postcode-ish token that is not a real outcode resolves to nothing (folds back).
        ("O2", None),
        ("ZZ9", None),
    ];
    for (area, expected) in cases {
        assert_eq!(resolve_outcode_idx(&known, area), expected);
    }
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue