Lots of improvements

This commit is contained in:
Andras Schmelczer 2026-03-10 22:05:51 +00:00
parent ef921361ec
commit 80a5a2a774
21 changed files with 489 additions and 337 deletions

View file

@ -464,7 +464,7 @@ impl PropertyData {
tracing::info!("Concatenating all data sources");
let buy_count = listings_buy.height();
let rent_count = listings_rent.height();
let mut combined = concat(
let combined = concat(
[
properties_joined.lazy(),
listings_buy.lazy(),
@ -495,36 +495,8 @@ impl PropertyData {
let numeric_names = features::all_numeric_feature_names();
let enum_names = features::all_enum_feature_names();
// Fill in NaN/empty placeholder columns for features that don't exist in all
// sources (e.g. Listing date only comes from listings, Estimated current price
// only from properties). Without this, diagonal concat leaves them absent.
{
let schema = combined.schema();
let mut fill_exprs: Vec<Expr> = Vec::new();
for &name in &numeric_names {
if schema.get(name).is_none() {
tracing::info!(feature = %name, "Adding NaN placeholder for missing numeric feature");
fill_exprs.push(lit(f32::NAN).alias(name));
}
}
for &name in &enum_names {
if schema.get(name).is_none() {
tracing::info!(feature = %name, "Adding empty placeholder for missing enum feature");
fill_exprs.push(lit("").alias(name));
}
}
if !fill_exprs.is_empty() {
combined = combined
.lazy()
.with_columns(fill_exprs)
.collect()
.context("Failed to add placeholder columns for missing features")?;
}
}
let schema = combined.schema();
// Validate: every configured feature exists in combined schema
for name in &numeric_names {
match schema.get(name) {
Some(dtype) if is_numeric_dtype(dtype) => {}

View file

@ -92,7 +92,7 @@ pub static FEATURE_GROUPS: &[FeatureGroup] = &[
},
step: 10000.0,
description: "Inflation-adjusted estimate of the current property value",
detail: "Estimated by applying a repeat-sales price index to the last known sale price, plus a renovation premium for properties with post-sale improvements detected from EPC records (extensions, renovations, remodeling). The index tracks price changes within each postcode sector and property type. Renovation premiums are estimated per area from observed repeat-sale pairs and decay over time. Properties sold recently will have estimates close to their sale price; older sales are adjusted more.",
detail: "Estimated by applying a repeat-sales price index to the last known sale price, plus a renovation premium for properties with post-sale improvements detected from EPC records (extensions, renovations, remodelling). The index tracks price changes within each postcode sector and property type. Renovation premiums are estimated per area from observed repeat-sale pairs and decay over time. Properties sold recently will have estimates close to their sale price; older sales are adjusted more.",
source: "price-paid",
prefix: "£",
suffix: "",
@ -259,7 +259,7 @@ pub static FEATURE_GROUPS: &[FeatureGroup] = &[
},
step: 50.0,
description: "Listed monthly rent for properties currently for rent",
detail: "The advertised rental price normalized to monthly for properties currently listed for rent on online property portals. Weekly rents are converted (×52/12), yearly (/12), daily (×365.25/12), and quarterly (/3). Only populated for 'For rent' listings.",
detail: "The advertised rental price normalised to monthly for properties currently listed for rent on online property portals. Weekly rents are converted (×52/12), yearly (/12), daily (×365.25/12), and quarterly (/3). Only populated for 'For rent' listings.",
source: "online-listings",
prefix: "£",
suffix: "/mo",
@ -325,82 +325,14 @@ pub static FEATURE_GROUPS: &[FeatureGroup] = &[
name: "Transport",
features: &[
FeatureConfig {
name: "Public transport to Bank (mins)",
bounds: Bounds::Fixed {
min: 0.0,
max: 180.0,
},
step: 2.0,
description: "Public transport journey time to Bank station",
detail: "Journey time in minutes by public transport to Bank station in the City of London, using TfL's Journey Planner API. Calculated for weekday morning commute times.",
source: "tfl-journey-times",
prefix: "",
suffix: " mins",
raw: false,
absolute: false,
modes: &[],
linked: "",
},
FeatureConfig {
name: "Public transport to Fitzrovia (mins)",
bounds: Bounds::Fixed {
min: 0.0,
max: 180.0,
},
step: 2.0,
description: "Public transport journey time to Fitzrovia",
detail: "Journey time in minutes by public transport to Fitzrovia in central London, using TfL's Journey Planner API. Calculated for weekday morning commute times.",
source: "tfl-journey-times",
prefix: "",
suffix: " mins",
raw: false,
absolute: false,
modes: &[],
linked: "",
},
FeatureConfig {
name: "Cycling to Bank (mins)",
bounds: Bounds::Fixed {
min: 0.0,
max: 180.0,
},
step: 1.0,
description: "Cycling time to Bank station",
detail: "Cycling journey time in minutes to Bank station, as calculated by the TfL Journey Planner API. Uses TfL's default cycling speed and route preferences.",
source: "tfl-journey-times",
prefix: "",
suffix: " mins",
raw: false,
absolute: false,
modes: &[],
linked: "",
},
FeatureConfig {
name: "Cycling to Fitzrovia (mins)",
bounds: Bounds::Fixed {
min: 0.0,
max: 180.0,
},
step: 1.0,
description: "Cycling time to Fitzrovia",
detail: "Cycling journey time in minutes to Fitzrovia, as calculated by the TfL Journey Planner API. Uses TfL's default cycling speed and route preferences.",
source: "tfl-journey-times",
prefix: "",
suffix: " mins",
raw: false,
absolute: false,
modes: &[],
linked: "",
},
FeatureConfig {
name: "Number of public transport stations within 2km",
name: "Train or tube stations within 1km",
bounds: Bounds::Percentile {
low: 5.0,
high: 95.0,
},
step: 1.0,
description: "Number of public transport stops within 2km",
detail: "Count of bus stops, rail stations, tube stations, tram stops, and other public transport access points within a 2km radius of the property's postcode. Derived from the NaPTAN (National Public Transport Access Nodes) dataset.",
description: "Number of train or tube stations within 1km",
detail: "Count of rail stations and Tube/metro/tram stops within a 1km radius of the property's postcode. Derived from the NaPTAN (National Public Transport Access Nodes) dataset. Does not include bus stops.",
source: "naptan",
prefix: "",
suffix: "",
@ -409,6 +341,23 @@ pub static FEATURE_GROUPS: &[FeatureGroup] = &[
modes: &[],
linked: "",
},
FeatureConfig {
name: "Distance to nearest train or tube station (km)",
bounds: Bounds::Percentile {
low: 2.0,
high: 98.0,
},
step: 0.1,
description: "Distance to the closest train or tube station",
detail: "Straight-line distance in kilometres from the property's postcode centroid to the nearest rail station or Tube/metro/tram stop. Derived from the NaPTAN (National Public Transport Access Nodes) dataset.",
source: "naptan",
prefix: "",
suffix: " km",
raw: false,
absolute: false,
modes: &[],
linked: "",
},
],
},
FeatureGroup {
@ -906,14 +855,31 @@ pub static FEATURE_GROUPS: &[FeatureGroup] = &[
linked: "",
},
FeatureConfig {
name: "% Asian",
name: "% South Asian",
bounds: Bounds::Fixed {
min: 0.0,
max: 100.0,
},
step: 1.0,
description: "Percentage of population identifying as Asian",
detail: "From the 2021 Census. Percentage of the local authority population identifying as Asian or Asian British (Indian, Pakistani, Bangladeshi, Chinese, or any other Asian background).",
description: "Percentage of population identifying as South Asian",
detail: "From the 2021 Census. Percentage of the local authority population identifying as Indian, Pakistani, Bangladeshi, or any other Asian background.",
source: "ethnicity",
prefix: "",
suffix: "%",
raw: false,
absolute: false,
modes: &[],
linked: "",
},
FeatureConfig {
name: "% East Asian",
bounds: Bounds::Fixed {
min: 0.0,
max: 100.0,
},
step: 1.0,
description: "Percentage of population identifying as East Asian",
detail: "From the 2021 Census. Percentage of the local authority population identifying as Chinese.",
source: "ethnicity",
prefix: "",
suffix: "%",
@ -1074,7 +1040,7 @@ pub static FEATURE_GROUPS: &[FeatureGroup] = &[
pub static ENUM_FEATURE_GROUPS: &[EnumFeatureGroup] = &[
EnumFeatureGroup {
name: "Property",
name: "Properties in the area",
features: &[
EnumFeatureConfig {
name: "Listing status",
@ -1084,7 +1050,7 @@ pub static ENUM_FEATURE_GROUPS: &[EnumFeatureGroup] = &[
source: "online-listings",
},
EnumFeatureConfig {
name: "Leashold/Freehold",
name: "Leasehold/Freehold",
order: Some(&["Freehold", "Leasehold"]),
description: "Whether the property is leasehold or freehold",
detail: "From HM Land Registry Price Paid data. Freehold means you own the building and the land it stands on. Leasehold means you own the building but not the land — you have a lease from the freeholder for a set number of years.",

View file

@ -417,16 +417,16 @@ async fn main() -> anyhow::Result<()> {
let state_short_url = state.clone();
let state_ai_filters = state.clone();
let state_streetview = state.clone();
let state_subscription = state.clone();
let state_newsletter = state.clone();
let state_travel_modes = state.clone();
let state_travel_destinations = state.clone();
let state_checkout = state.clone();
let state_stripe_webhook = state.clone();
let state_pricing = state.clone();
let state_invites_create = state.clone();
let state_invite_get = state.clone();
let state_redeem_invite = state.clone();
let state_rightmove = state.clone();
let state_journey = state.clone();
let api = Router::new()
.route(
@ -461,6 +461,14 @@ async fn main() -> anyhow::Result<()> {
"/api/travel-modes",
get(move || routes::get_travel_modes(state_travel_modes.clone())),
)
.route(
"/api/travel-destinations",
get(move |query| routes::get_travel_destinations(state_travel_destinations.clone(), query)),
)
.route(
"/api/journey",
get(move |query| routes::get_journey(state_journey.clone(), query)),
)
.route(
"/api/hexagon-properties",
get(move |ext, query| {
@ -502,16 +510,6 @@ async fn main() -> anyhow::Result<()> {
"/api/streetview",
get(move |query| routes::get_streetview(state_streetview.clone(), query)),
)
.route(
"/api/rightmove-location",
get(move |query| routes::get_rightmove_typeahead(state_rightmove.clone(), query)),
)
.route(
"/api/subscription",
patch(move |ext, body| {
routes::patch_subscription(state_subscription.clone(), ext, body)
}),
)
.route(
"/api/newsletter",
patch(move |ext, body| {

View file

@ -5,6 +5,7 @@ mod features;
mod hexagon_stats;
pub(crate) mod hexagons;
mod invites;
mod journey;
mod me;
mod pb_proxy;
mod places;
@ -20,10 +21,9 @@ mod streetview;
mod stripe_webhook;
mod newsletter;
pub(crate) mod pricing;
mod rightmove_typeahead;
mod subscription;
mod tiles;
pub(crate) mod travel_time;
mod travel_destinations;
mod travel_modes;
pub use ai_filters::{build_ollama_schema, build_system_prompt, post_ai_filters};
@ -44,10 +44,10 @@ pub use screenshot::{fetch_screenshot_bytes, get_screenshot};
pub use shorten::{get_short_url, post_shorten};
pub use streetview::get_streetview;
pub use invites::{get_invite, post_invites, post_redeem_invite};
pub use journey::get_journey;
pub use newsletter::patch_newsletter;
pub use pricing::get_pricing;
pub use stripe_webhook::post_stripe_webhook;
pub use subscription::patch_subscription;
pub use tiles::{get_style, get_tile, init_tile_reader};
pub use rightmove_typeahead::get_rightmove_typeahead;
pub use travel_destinations::get_travel_destinations;
pub use travel_modes::get_travel_modes;

View file

@ -146,7 +146,7 @@ pub fn build_system_prompt(features: &FeaturesResponse) -> String {
parts.push(
"User: \"cheap freehold house under 400k\"\n\
Output: {\"numeric_filters\": [{\"name\": \"Last known price\", \"bound\": \"max\", \"value\": 400000}], \
\"enum_filters\": [{\"name\": \"Leashold/Freehold\", \"values\": [\"Freehold\"]}, \
\"enum_filters\": [{\"name\": \"Leasehold/Freehold\", \"values\": [\"Freehold\"]}, \
{\"name\": \"Property type\", \"values\": [\"Detached\", \"Semi-Detached\", \"Terraced\"]}], \
\"notes\": \"\"}"
.to_string(),
@ -252,13 +252,13 @@ pub async fn post_ai_filters(
/// ```json
/// {
/// "numeric_filters": [{"name": "Last known price", "bound": "max", "value": 300000}],
/// "enum_filters": [{"name": "Leashold/Freehold", "values": ["Freehold"]}]
/// "enum_filters": [{"name": "Leasehold/Freehold", "values": ["Freehold"]}]
/// }
/// ```
///
/// Output format (FeatureFilters):
/// ```json
/// { "Last known price": [0, 300000], "Leashold/Freehold": ["Freehold"] }
/// { "Last known price": [0, 300000], "Leasehold/Freehold": ["Freehold"] }
/// ```
fn validate_and_convert(raw: &Value, features: &FeaturesResponse) -> Value {
let mut result = serde_json::Map::new();

View file

@ -18,7 +18,7 @@ use crate::parsing::{
bounds_intersect, cell_for_row, h3_cell_bounds, needs_parent, parse_field_indices,
parse_filters, require_bounds, row_passes_filters, validate_h3_resolution,
};
use crate::routes::travel_time::TravelTimeAgg;
use crate::routes::travel_time::{parse_travel_entries, TravelTimeAgg};
use crate::state::AppState;
#[derive(Serialize)]
@ -40,62 +40,6 @@ pub struct HexagonParams {
travel: Option<String>,
}
/// One parsed element of the `travel` query parameter.
#[derive(Debug, Clone, PartialEq)]
struct TravelEntry {
    /// First `:`-separated field of the segment, trimmed.
    mode: String,
    /// Second `:`-separated field of the segment, trimmed.
    slug: String,
    /// `true` when the third field is the literal `best`.
    use_best: bool,
    /// Lower bound of the optional filter range; set together with `filter_max`.
    filter_min: Option<f32>,
    /// Upper bound of the optional filter range; set together with `filter_min`.
    filter_max: Option<f32>,
}

/// Parse the `travel` param into a list of travel entries.
///
/// Entries are separated by `|`; the fields inside an entry are separated by `:`
/// and trimmed of surrounding whitespace. Accepted shapes:
/// `mode:slug`, `mode:slug:best`, `mode:slug:min:max`, `mode:slug:best:min:max`.
///
/// A filter range is only recognised when both `min` and `max` are present after
/// the optional `best` marker. An incomplete tail (e.g. `mode:slug:10`) and any
/// fields after `max` are ignored, leaving both filter bounds `None`.
///
/// # Errors
///
/// Returns a human-readable message when a segment has fewer than two fields,
/// when `min` or `max` is present but does not parse as `f32`, or when the same
/// `mode:slug` pair occurs more than once.
fn parse_travel_entries(travel_str: &str) -> Result<Vec<TravelEntry>, String> {
    let mut entries = Vec::new();
    // Set of `mode:slug` keys already accepted, for O(1) duplicate detection.
    let mut seen_keys = std::collections::HashSet::new();

    for segment in travel_str.split('|') {
        // Trim every field once up front instead of at each use site.
        let parts: Vec<&str> = segment.split(':').map(str::trim).collect();
        if parts.len() < 2 {
            return Err(format!(
                "each travel entry must be 'mode:slug' or 'mode:slug:min:max', got '{}'",
                segment
            ));
        }
        let (mode, slug) = (parts[0], parts[1]);

        // A `best` marker in third position shifts the filter fields right by one.
        let use_best = parts.get(2) == Some(&"best");
        let filter_start = 2 + usize::from(use_best);

        // The range `get` yields `None` unless both bounds are present.
        let (filter_min, filter_max) = match parts.get(filter_start..filter_start + 2) {
            Some([min, max]) => {
                let min: f32 = min
                    .parse()
                    .map_err(|_| format!("invalid travel filter min in '{}'", segment))?;
                let max: f32 = max
                    .parse()
                    .map_err(|_| format!("invalid travel filter max in '{}'", segment))?;
                (Some(min), Some(max))
            }
            _ => (None, None),
        };

        // `replace` hands back the previously stored key when one already existed,
        // so the duplicate check needs a single lookup and no clone.
        if let Some(duplicate) = seen_keys.replace(format!("{}:{}", mode, slug)) {
            return Err(format!("duplicate travel entry '{}'", duplicate));
        }

        entries.push(TravelEntry {
            mode: mode.to_string(),
            slug: slug.to_string(),
            use_best,
            filter_min,
            filter_max,
        });
    }

    Ok(entries)
}
/// Build feature maps from aggregated cell data, filtering to only cells that intersect the query bounds.
#[allow(clippy::too_many_arguments)]