This commit is contained in:
Andras Schmelczer 2026-05-26 19:45:13 +01:00
parent c645b0f1d4
commit 39ef5c6646
79 changed files with 5660 additions and 2199 deletions

View file

@ -61,6 +61,9 @@ pub struct ActualListingData {
/// overlaid where available. This lets the listings endpoint use the same filter
/// execution path as the property endpoints.
pub filter_feature_data: Vec<u16>,
/// Row-major dynamic postcode POI metrics aligned with
/// PropertyData::poi_metrics.feature_names.
pub poi_filter_feature_data: Vec<u16>,
pub grid: GridIndex,
}
@ -109,16 +112,16 @@ impl ActualListingData {
let listing_status = InternedColumn::build(&opt_to_string(&listing_status_raw));
let filter_feature_data = build_filter_feature_data(
&df,
property_data,
&postcode,
&address,
&property_type_raw,
&leasehold_freehold_raw,
&rooms_total,
&floor_area_sqm,
&asking_price,
&asking_price_per_sqm,
);
)?;
let poi_filter_feature_data = build_poi_filter_feature_data(&df, property_data)?;
let grid = GridIndex::build(&lat, &lon, GRID_CELL_SIZE);
@ -144,6 +147,7 @@ impl ActualListingData {
listing_date_iso,
features,
filter_feature_data,
poi_filter_feature_data,
grid,
})
}
@ -174,49 +178,37 @@ impl ActualListingData {
#[allow(clippy::too_many_arguments)]
fn build_filter_feature_data(
df: &DataFrame,
property_data: Option<&PropertyData>,
postcode: &[String],
address: &[Option<String>],
property_type: &[Option<String>],
leasehold_freehold: &[Option<String>],
rooms_total: &[Option<i32>],
floor_area_sqm: &[Option<f32>],
asking_price: &[Option<i64>],
asking_price_per_sqm: &[Option<f32>],
) -> Vec<u16> {
) -> Result<Vec<u16>> {
let Some(property_data) = property_data else {
return Vec::new();
return Ok(Vec::new());
};
let num_features = property_data.num_features;
let mut feature_data = vec![NAN_U16; postcode.len() * num_features];
let mut joined_rows = 0usize;
let row_count = df.height();
let mut feature_data = vec![NAN_U16; row_count * num_features];
let quant = property_data.quant_ref();
let mut encoded_columns = 0usize;
for (row, postcode_value) in postcode.iter().enumerate() {
let Some(address_value) = address[row]
.as_deref()
.map(str::trim)
.filter(|v| !v.is_empty())
else {
continue;
};
let query = format!("{address_value} {postcode_value}");
let Some(&property_row) = property_data.search_addresses(&query, 1).first() else {
continue;
};
if property_data.postcode(property_row) != postcode_value {
continue;
for (feat_idx, name) in property_data.feature_names.iter().enumerate() {
if feat_idx < property_data.num_numeric {
if let Some(values) = extract_optional_feature_f32(df, name)? {
encode_numeric_feature(&mut feature_data, property_data, &quant, feat_idx, values);
encoded_columns += 1;
}
} else if let Some(values) = extract_optional_feature_str(df, name)? {
encode_enum_feature(&mut feature_data, property_data, feat_idx, values);
encoded_columns += 1;
}
let dst = row * num_features;
let src = property_row * num_features;
feature_data[dst..dst + num_features]
.copy_from_slice(&property_data.feature_data[src..src + num_features]);
joined_rows += 1;
}
let quant = property_data.quant_ref();
overlay_numeric_feature(
&mut feature_data,
property_data,
@ -281,11 +273,50 @@ fn build_filter_feature_data(
);
info!(
rows = postcode.len(),
joined_rows, "Actual listings joined to property feature matrix"
rows = row_count,
encoded_columns, "Actual listings feature matrix read from enriched parquet"
);
feature_data
Ok(feature_data)
}
/// Builds the row-major POI metric matrix for the listings in `df`.
///
/// The matrix has `df.height()` rows and one column per entry of
/// `property_data.poi_metrics.feature_names`, in that order. A cell holds the
/// result of `encode_numeric_value` for the listing's metric, or `NAN_U16`
/// when the metric column is absent from `df` or the value is missing.
///
/// Returns an empty vector when `property_data` is `None` or when no POI
/// metrics are defined.
///
/// # Errors
/// Propagates any error returned by `extract_optional_feature_f32`.
fn build_poi_filter_feature_data(
    df: &DataFrame,
    property_data: Option<&PropertyData>,
) -> Result<Vec<u16>> {
    let poi_metrics = match property_data {
        Some(data) => &data.poi_metrics,
        None => return Ok(Vec::new()),
    };
    let width = poi_metrics.num_features();
    if width == 0 {
        return Ok(Vec::new());
    }

    let row_count = df.height();
    // Start fully "missing"; only columns present in `df` are overwritten.
    let mut matrix = vec![NAN_U16; row_count * width];
    let quant = poi_metrics.quant_ref();
    let mut encoded_columns = 0usize;

    for (column, metric_name) in poi_metrics.feature_names.iter().enumerate() {
        if let Some(column_values) = extract_optional_feature_f32(df, metric_name)? {
            for (row, cell) in column_values.into_iter().enumerate() {
                matrix[row * width + column] = match cell {
                    Some(raw) => encode_numeric_value(&quant, column, raw),
                    None => NAN_U16,
                };
            }
            encoded_columns += 1;
        }
    }

    info!(
        rows = row_count,
        encoded_columns, "Actual listings POI metrics read from enriched parquet"
    );
    Ok(matrix)
}
fn feature_index(property_data: &PropertyData, name: &str) -> Option<usize> {
@ -323,6 +354,53 @@ fn overlay_numeric_feature<I>(
}
}
/// Writes one numeric feature column into the row-major `feature_data` matrix.
///
/// The n-th item of `values` lands in cell `n * property_data.num_features +
/// feat_idx`. Present values are encoded with `encode_numeric_value`; missing
/// values are stored as `NAN_U16`.
///
/// # Panics
/// Panics if `values` yields more rows than `feature_data` can hold.
fn encode_numeric_feature<I>(
    feature_data: &mut [u16],
    property_data: &PropertyData,
    quant: &QuantRef<'_>,
    feat_idx: usize,
    values: I,
) where
    I: IntoIterator<Item = Option<f32>>,
{
    let stride = property_data.num_features;
    // Walk down the feature's column: first row's cell, then one stride per row.
    let mut cell = feat_idx;
    for value in values {
        feature_data[cell] = match value {
            Some(raw) => encode_numeric_value(quant, feat_idx, raw),
            None => NAN_U16,
        };
        cell += stride;
    }
}
/// Reads column `name` from `df` as optional `f32` values.
///
/// Returns `Ok(None)` when `df.column(name)` fails (e.g. the column does not
/// exist). `Date`/`Datetime` columns are converted to a fractional year,
/// `year + (month - 1) / 12`. Every other column is cast to `Float32`, with
/// non-finite values replaced by `None`.
///
/// # Errors
/// Fails when the date projection cannot be collected, when the cast to
/// `Float32` fails, or when `extract_opt_f32` fails on the projected frame.
fn extract_optional_feature_f32(df: &DataFrame, name: &str) -> Result<Option<Vec<Option<f32>>>> {
    let column = match df.column(name) {
        Ok(column) => column,
        Err(_) => return Ok(None),
    };

    let is_temporal = matches!(column.dtype(), DataType::Datetime(_, _) | DataType::Date);
    if !is_temporal {
        let as_f32 = column
            .cast(&DataType::Float32)
            .with_context(|| format!("Failed to cast feature '{name}' to Float32"))?;
        let finite_only = as_f32
            .f32()
            .with_context(|| format!("Feature '{name}' is not Float32"))?
            .into_iter()
            .map(|entry| entry.filter(|number| number.is_finite()))
            .collect();
        return Ok(Some(finite_only));
    }

    // Temporal column: express the date as a fractional year at month resolution.
    let year = col(name).dt().year().cast(DataType::Float32);
    let month_fraction =
        (col(name).dt().month().cast(DataType::Float32) - lit(1.0f32)) / lit(12.0f32);
    let projected = df
        .clone()
        .lazy()
        .select([(year + month_fraction).alias("__feature")])
        .collect()
        .with_context(|| format!("Failed to convert datetime feature '{name}'"))?;
    let fractional_years = extract_opt_f32(&projected, "__feature")?;
    Ok(Some(fractional_years))
}
fn overlay_enum_feature<'a, I>(
feature_data: &mut [u16],
property_data: &PropertyData,
@ -355,6 +433,46 @@ fn overlay_enum_feature<'a, I>(
}
}
fn encode_enum_feature(
feature_data: &mut [u16],
property_data: &PropertyData,
feat_idx: usize,
values: Vec<Option<String>>,
) {
let Some(enum_values) = property_data.enum_values.get(&feat_idx) else {
return;
};
let num_features = property_data.num_features;
for (row, value) in values.into_iter().enumerate() {
let dst = row * num_features + feat_idx;
feature_data[dst] = value
.as_deref()
.map(str::trim)
.filter(|text| !text.is_empty())
.and_then(|text| enum_values.iter().position(|candidate| candidate == text))
.map(|position| position as u16)
.unwrap_or(NAN_U16);
}
}
/// Reads column `name` from `df` as optional owned strings.
///
/// Returns `Ok(None)` when `df.column(name)` fails (e.g. the column does not
/// exist). Null and whitespace-only entries become `None`; every other entry
/// is kept exactly as stored (it is not trimmed).
///
/// # Errors
/// Fails when the column cannot be cast to a string column.
fn extract_optional_feature_str(df: &DataFrame, name: &str) -> Result<Option<Vec<Option<String>>>> {
    let column = match df.column(name) {
        Ok(column) => column,
        Err(_) => return Ok(None),
    };
    let as_text = column
        .cast(&DataType::String)
        .with_context(|| format!("Failed to cast feature '{name}' to String"))?;
    let text_values = as_text
        .str()
        .with_context(|| format!("Feature '{name}' is not a string column"))?;
    let rows = text_values
        .into_iter()
        .map(|entry| {
            entry
                .filter(|text| !text.trim().is_empty())
                .map(|text| text.to_string())
        })
        .collect();
    Ok(Some(rows))
}
fn encode_numeric_value(quant: &QuantRef<'_>, feat_idx: usize, value: f32) -> u16 {
if !value.is_finite() {
return NAN_U16;
@ -517,8 +635,13 @@ mod tests {
use std::path::PathBuf;
/// Returns a sample listings parquet path under `../finder/data/` that exists
/// on disk, or `None` if none is found.
fn sample_path() -> Option<PathBuf> {
let path = PathBuf::from("../finder/data/online_listings_buy.parquet");
path.exists().then_some(path)
[
"../finder/data/online_listings_buy_enriched.parquet",
"../finder/data/online_listings_buy.parquet",
]
.into_iter()
.map(PathBuf::from)
.find(|path| path.exists())
}
#[test]

View file

@ -63,7 +63,20 @@ const DASHBOARD_POI_GROUPS: &[(&str, &[&str])] = &[
("Groceries", GROCERY_DASHBOARD_CATEGORIES),
("Food & Drink", &["Café", "Restaurant", "Pub", "Fast Food"]),
("Green Space", &["Park", "Playground"]),
("Education", &["School"]),
(
"Education",
&[
"Nursery school",
"Primary school",
"Secondary school",
"All-through school",
"Sixth form",
"Further education college",
"University",
"Special school",
"School",
],
),
(
"Health",
&["GP Surgery", "Pharmacy", "Dentist", "Hospital & Clinic"],
@ -119,6 +132,21 @@ fn canonical_poi_category(category: &str) -> &str {
}
}
/// Categories the pipeline emits for the GIAS-derived school POIs. A bare
/// `poi=School` URL (predating the per-phase split) is expanded to all of these
/// so bookmarked links keep showing schools.
///
/// The list contains `"School"` itself, so a loaded category literally named
/// `School` is still selected by the legacy filter. The entries are the same
/// as the "Education" group in `DASHBOARD_POI_GROUPS`; keep the two lists in
/// sync when adding or renaming a school category.
const SCHOOL_CATEGORY_ALIASES: &[&str] = &[
"Nursery school",
"Primary school",
"Secondary school",
"All-through school",
"Sixth form",
"Further education college",
"University",
"Special school",
"School",
];
pub fn resolve_poi_category_filter(category_values: &[String], categories: &str) -> FxHashSet<u16> {
let mut selected = FxHashSet::default();
for part in categories.split(',') {
@ -126,6 +154,12 @@ pub fn resolve_poi_category_filter(category_values: &[String], categories: &str)
if category.is_empty() {
continue;
}
if category == "School" {
for alias in SCHOOL_CATEGORY_ALIASES {
add_category_filter_index(category_values, alias, &mut selected);
}
continue;
}
add_category_filter_index(category_values, category, &mut selected);
}
selected
@ -174,6 +208,8 @@ pub struct SchoolMetadata {
pub telephone: Option<String>,
#[serde(skip_serializing_if = "Option::is_none")]
pub head_name: Option<String>,
#[serde(skip_serializing_if = "Option::is_none")]
pub ofsted_rating: Option<String>,
}
pub struct POIData {
@ -350,6 +386,8 @@ fn build_school_meta(
let website = extract_optional_str_col(df, "school_website")?.unwrap_or_default();
let telephone = extract_optional_str_col(df, "school_telephone")?.unwrap_or_default();
let head_name = extract_optional_str_col(df, "school_head_name")?.unwrap_or_default();
let ofsted_rating =
extract_optional_str_col(df, "school_ofsted_rating")?.unwrap_or_default();
let fetch_str = |col: &Vec<Option<String>>, row: usize| -> Option<String> {
col.get(row).cloned().flatten()
@ -390,6 +428,7 @@ fn build_school_meta(
website: fetch_str(&website, row),
telephone: fetch_str(&telephone, row),
head_name: fetch_str(&head_name, row),
ofsted_rating: fetch_str(&ofsted_rating, row),
});
}
Ok((idx, meta))
@ -578,6 +617,26 @@ mod tests {
assert!(selected.is_empty());
}
#[test]
fn legacy_school_filter_expands_to_all_school_categories() {
    // A bare `poi=School` (as sent by links bookmarked before the per-phase
    // split) must select every loaded school category and nothing else.
    let values: Vec<String> = ["Primary school", "Secondary school", "University", "Tesco"]
        .into_iter()
        .map(String::from)
        .collect();

    let mut matched: Vec<u16> = resolve_poi_category_filter(&values, "School")
        .into_iter()
        .collect();
    matched.sort_unstable();

    // Indices 0..=2 are the school categories; index 3 ("Tesco") must be absent.
    assert_eq!(matched, vec![0, 1, 2]);
}
#[test]
fn coop_category_aliases_resolve_to_single_category() {
let values = vec!["Co-op".to_string(), "Tesco".to_string()];

View file

@ -891,6 +891,15 @@ impl PropertyData {
(&self.postcode_interner, &self.postcode_keys)
}
/// Looks up the property rows recorded for `postcode`.
///
/// Returns an empty slice when the postcode string is not in the interner or
/// has no entry in the postcode row index.
pub fn rows_for_postcode(&self, postcode: &str) -> &[u32] {
    let Some(key) = self.postcode_interner.get(postcode) else {
        return &[];
    };
    match self.postcode_row_index.get(&key) {
        Some(rows) => rows.as_slice(),
        None => &[],
    }
}
fn row_address_search_tokens(&self, row: usize) -> &[lasso::Spur] {
let offset = self.address_search_token_offsets[row] as usize;
let length = self.address_search_token_lengths[row] as usize;