perfect-postcode/server-rs/src/data/postcodes.rs

336 lines
12 KiB
Rust

use anyhow::Context;
use rayon::prelude::*;
use rustc_hash::FxHashMap;
use serde::Deserialize;
use std::fs;
use std::path::Path;
use tracing::{debug, info};
use super::PlaceData;
/// Precomputed outcode data derived from postcode boundaries.
///
/// An outcode is the first part of a UK postcode (e.g. "E14" from "E14 2DG").
/// The four vectors are parallel: index `i` in each one describes the same outcode.
#[derive(Debug, Clone)]
pub struct OutcodeData {
    /// Outcode strings exactly as they appear in the postcode data (e.g. "E14").
    pub names: Vec<String>,
    /// Lowercased copy of `names`, index-aligned with it.
    pub name_lower: Vec<String>,
    /// Centroid of each outcode as `(lat, lon)`.
    pub centroids: Vec<(f32, f32)>,
    /// Name of the nearest city for each outcode, or `None` when no city was matched.
    pub cities: Vec<Option<String>>,
}
impl OutcodeData {
    /// Derive outcode data by grouping postcodes by their outcode prefix and averaging
    /// their centroids, then attaching the nearest city to each outcode.
    ///
    /// * `postcode_data` - source of postcode strings and their `(lat, lon)` centroids.
    ///   `postcodes` and `centroids` are read as parallel vectors.
    /// * `place_data` - source of candidate cities. Entries whose `type_rank` is `0` are
    ///   the ones considered (presumably rank 0 means "city" - confirm against `PlaceData`).
    ///
    /// The returned vectors are parallel and sorted by outcode name. Postcodes that
    /// contain no space are skipped because no outcode can be split off them.
    pub fn from_postcode_and_place_data(
        postcode_data: &PostcodeData,
        place_data: &PlaceData,
    ) -> Self {
        // Squared cut-off for the nearest-city search, in squared degrees.
        // 0.9 degrees of latitude is roughly 100 km.
        const MAX_CITY_DIST_SQ: f32 = 0.81;

        // Running (sum_lat, sum_lon, count) per outcode. Keys borrow from `postcode_data`,
        // so no String is allocated per postcode, and the sums are kept in f64 so that
        // adding up thousands of f32 coordinates does not accumulate rounding drift.
        let mut sums: FxHashMap<&str, (f64, f64, usize)> = FxHashMap::default();
        for (postcode, &(lat, lon)) in postcode_data
            .postcodes
            .iter()
            .zip(&postcode_data.centroids)
        {
            if let Some((outcode, _)) = postcode.split_once(' ') {
                let acc = sums.entry(outcode).or_insert((0.0, 0.0, 0));
                acc.0 += f64::from(lat);
                acc.1 += f64::from(lon);
                acc.2 += 1;
            }
        }

        // Average each group and order the outcodes by name. Keys are unique, so an
        // unstable sort cannot reorder equal elements in any observable way.
        let mut entries: Vec<(&str, (f32, f32))> = sums
            .into_iter()
            .map(|(outcode, (sum_lat, sum_lon, count))| {
                let n = count as f64;
                (outcode, ((sum_lat / n) as f32, (sum_lon / n) as f32))
            })
            .collect();
        entries.sort_unstable_by_key(|&(outcode, _)| outcode);

        let (names, centroids): (Vec<String>, Vec<(f32, f32)>) = entries
            .into_iter()
            .map(|(outcode, centroid)| (outcode.to_owned(), centroid))
            .unzip();
        let name_lower: Vec<String> = names.iter().map(|n| n.to_lowercase()).collect();

        // Indices of the places that are eligible as "nearest city".
        let city_indices: Vec<usize> = place_data
            .type_rank
            .iter()
            .enumerate()
            .filter_map(|(idx, &rank)| if rank == 0 { Some(idx) } else { None })
            .collect();

        let cities: Vec<Option<String>> = centroids
            .iter()
            .map(|&(lat, lon)| {
                // Scale longitude differences by cos(latitude) so that both axes are
                // compared in comparable units (equirectangular approximation).
                let cos_lat = lat.to_radians().cos();
                let mut best_dist_sq = f32::MAX;
                let mut best_city: Option<&str> = None;
                for &ci in &city_indices {
                    let dlat = place_data.lat[ci] - lat;
                    let dlon = (place_data.lon[ci] - lon) * cos_lat;
                    let dist_sq = dlat * dlat + dlon * dlon;
                    // Strict `<` keeps the first city on ties.
                    if dist_sq < best_dist_sq {
                        best_dist_sq = dist_sq;
                        best_city = Some(&place_data.name[ci]);
                    }
                }
                if best_dist_sq < MAX_CITY_DIST_SQ {
                    best_city.map(|s| s.to_string())
                } else {
                    None
                }
            })
            .collect();

        info!(outcodes = names.len(), "Outcode data derived from postcodes");
        OutcodeData {
            names,
            name_lower,
            centroids,
            cities,
        }
    }
}
/// Top-level GeoJSON `FeatureCollection` as read from one postcode boundary file.
///
/// Only the `features` member is declared here; this is the root type passed to
/// `serde_json::from_str` when a file is parsed.
#[derive(Deserialize)]
struct FeatureCollection {
/// The boundary features contained in the file.
features: Vec<Feature>,
}
/// A single GeoJSON feature: one boundary shape plus its properties.
#[derive(Deserialize)]
struct Feature {
/// Boundary shape of the feature.
geometry: Geometry,
/// Attributes attached to the feature (carries the postcode string).
properties: Properties,
}
/// GeoJSON geometry, selected by the JSON `"type"` member (serde internally tagged enum).
///
/// Only `Polygon` and `MultiPolygon` are declared, so any other geometry type fails
/// to deserialize. Each position is a two-element array that the loader reads as
/// `[lon, lat]`.
#[derive(Deserialize)]
#[serde(tag = "type")]
enum Geometry {
/// One polygon: a list of rings, each ring a list of positions.
Polygon {
coordinates: Vec<Vec<[f64; 2]>>,
},
/// Several polygons: a list of polygons, each a list of rings.
MultiPolygon {
coordinates: Vec<Vec<Vec<[f64; 2]>>>,
},
}
/// Feature properties read from the boundary files.
#[derive(Deserialize)]
struct Properties {
/// Value of the `postcodes` property. Despite the plural name, the loader stores
/// the whole string as a single postcode.
postcodes: String,
}
/// Postcode boundary data: per-postcode centroid, bounding box and GeoJSON geometry,
/// plus a lookup from postcode string to index.
///
/// `postcodes`, `centroids`, `aabbs` and `geometries` are parallel vectors as built by
/// `PostcodeData::load`: index `i` in each one describes the same postcode.
pub struct PostcodeData {
/// Postcode strings
pub postcodes: Vec<String>,
/// Centroid (lat, lon) for lookups
pub centroids: Vec<(f32, f32)>,
/// Precomputed AABB per postcode: (south, west, north, east) as f32
pub aabbs: Vec<(f32, f32, f32, f32)>,
/// Precomputed GeoJSON geometry Value per postcode
pub geometries: Vec<serde_json::Value>,
/// Lookup from postcode string to index
/// (an index into the parallel vectors above).
pub postcode_to_idx: FxHashMap<String, usize>,
}
impl PostcodeData {
/// Load postcode boundaries from a directory of GeoJSON files.
/// Expects the directory to have a `units/` subdirectory containing .geojson files.
pub fn load(dir_path: &Path) -> anyhow::Result<Self> {
info!("Loading postcode boundaries from {:?}", dir_path);
let units_dir = dir_path.join("units");
if !units_dir.exists() {
anyhow::bail!(
"Expected 'units' subdirectory in postcode boundaries path: {:?}",
dir_path
);
}
let mut postcodes: Vec<String> = Vec::new();
let mut polygons: Vec<Vec<Vec<[f32; 2]>>> = Vec::new();
let mut centroids: Vec<(f32, f32)> = Vec::new();
// Read all .geojson files in the units directory
let mut entries: Vec<_> = fs::read_dir(&units_dir)
.with_context(|| format!("Failed to read directory: {:?}", units_dir))?
.filter_map(|entry| entry.ok())
.filter(|entry| {
entry
.path()
.extension()
.map(|ext| ext == "geojson")
.unwrap_or(false)
})
.collect();
entries.sort_by_key(|entry| entry.path());
info!(files = entries.len(), "Found GeoJSON files to process");
// Parse files in parallel
let file_results: Vec<_> = entries
.into_par_iter()
.map(|entry| {
let file_path = entry.path();
let content = fs::read_to_string(&file_path)
.with_context(|| format!("Failed to read file: {:?}", file_path))?;
let collection: FeatureCollection = serde_json::from_str(&content)
.with_context(|| format!("Failed to parse GeoJSON: {:?}", file_path))?;
let mut local_postcodes = Vec::new();
let mut local_polygons = Vec::new();
let mut local_centroids = Vec::new();
let mut local_aabbs: Vec<(f32, f32, f32, f32)> = Vec::new();
for feature in collection.features {
let postcode = feature.properties.postcodes;
// Extract all outer rings from the geometry
let rings: Vec<Vec<[f32; 2]>> = match feature.geometry {
Geometry::Polygon { coordinates } => coordinates
.first()
.map(|ring| {
vec![ring
.iter()
.map(|[lon, lat]| [*lon as f32, *lat as f32])
.collect()]
})
.unwrap_or_default(),
Geometry::MultiPolygon { coordinates } => coordinates
.iter()
.filter_map(|poly| {
poly.first().map(|ring| {
ring.iter()
.map(|[lon, lat]| [*lon as f32, *lat as f32])
.collect()
})
})
.collect(),
};
// Compute centroid across all vertices from all rings
let total_vertices: usize = rings.iter().map(|ring| ring.len()).sum();
let centroid = if total_vertices == 0 {
tracing::warn!(postcode = %postcode, "Postcode polygon has zero vertices, defaulting centroid to (0,0)");
(0.0, 0.0)
} else {
let mut sum_lat: f32 = 0.0;
let mut sum_lon: f32 = 0.0;
for ring in &rings {
for &[lon, lat] in ring {
sum_lat += lat;
sum_lon += lon;
}
}
let count = total_vertices as f32;
(sum_lat / count, sum_lon / count)
};
// Compute AABB across all rings
let (mut aabb_south, mut aabb_north) = (f32::INFINITY, f32::NEG_INFINITY);
let (mut aabb_west, mut aabb_east) = (f32::INFINITY, f32::NEG_INFINITY);
for ring in &rings {
for &[lon, lat] in ring {
if lat < aabb_south {
aabb_south = lat;
}
if lat > aabb_north {
aabb_north = lat;
}
if lon < aabb_west {
aabb_west = lon;
}
if lon > aabb_east {
aabb_east = lon;
}
}
}
local_postcodes.push(postcode);
local_polygons.push(rings);
local_centroids.push(centroid);
local_aabbs.push((aabb_south, aabb_west, aabb_north, aabb_east));
}
Ok::<_, anyhow::Error>((
local_postcodes,
local_polygons,
local_centroids,
local_aabbs,
))
})
.collect::<Result<Vec<_>, _>>()?;
let mut aabbs: Vec<(f32, f32, f32, f32)> = Vec::new();
// Flatten results
for (local_postcodes, local_polygons, local_centroids, local_aabbs) in file_results {
postcodes.extend(local_postcodes);
polygons.extend(local_polygons);
centroids.extend(local_centroids);
aabbs.extend(local_aabbs);
}
debug!(
postcodes = postcodes.len(),
"Extracted postcodes from GeoJSON"
);
// Build postcode -> index lookup
let mut postcode_to_idx: FxHashMap<String, usize> = FxHashMap::default();
for (idx, postcode) in postcodes.iter().enumerate() {
postcode_to_idx.insert(postcode.clone(), idx);
}
// Precompute GeoJSON geometry for each postcode
let geometries: Vec<serde_json::Value> = polygons
.iter()
.map(|rings| {
if rings.len() == 1 {
let coords: Vec<serde_json::Value> = rings[0]
.iter()
.map(|[lon, lat]| {
serde_json::Value::Array(vec![
serde_json::Value::from(*lon as f64),
serde_json::Value::from(*lat as f64),
])
})
.collect();
serde_json::json!({"type": "Polygon", "coordinates": [coords]})
} else {
let polys: Vec<serde_json::Value> = rings
.iter()
.map(|ring| {
let coords: Vec<serde_json::Value> = ring
.iter()
.map(|[lon, lat]| {
serde_json::Value::Array(vec![
serde_json::Value::from(*lon as f64),
serde_json::Value::from(*lat as f64),
])
})
.collect();
serde_json::Value::Array(vec![serde_json::Value::Array(coords)])
})
.collect();
serde_json::json!({"type": "MultiPolygon", "coordinates": polys})
}
})
.collect();
info!(postcodes = postcodes.len(), "Postcode boundary data ready");
Ok(PostcodeData {
postcodes,
centroids,
aabbs,
geometries,
postcode_to_idx,
})
}
}