alright
This commit is contained in:
parent
c645b0f1d4
commit
39ef5c6646
79 changed files with 5660 additions and 2199 deletions
|
|
@ -61,6 +61,9 @@ pub struct ActualListingData {
|
|||
/// overlaid where available. This lets the listings endpoint use the same filter
|
||||
/// execution path as the property endpoints.
|
||||
pub filter_feature_data: Vec<u16>,
|
||||
/// Row-major dynamic postcode POI metrics aligned with
|
||||
/// PropertyData::poi_metrics.feature_names.
|
||||
pub poi_filter_feature_data: Vec<u16>,
|
||||
pub grid: GridIndex,
|
||||
}
|
||||
|
||||
|
|
@ -109,16 +112,16 @@ impl ActualListingData {
|
|||
let listing_status = InternedColumn::build(&opt_to_string(&listing_status_raw));
|
||||
|
||||
let filter_feature_data = build_filter_feature_data(
|
||||
&df,
|
||||
property_data,
|
||||
&postcode,
|
||||
&address,
|
||||
&property_type_raw,
|
||||
&leasehold_freehold_raw,
|
||||
&rooms_total,
|
||||
&floor_area_sqm,
|
||||
&asking_price,
|
||||
&asking_price_per_sqm,
|
||||
);
|
||||
)?;
|
||||
let poi_filter_feature_data = build_poi_filter_feature_data(&df, property_data)?;
|
||||
|
||||
let grid = GridIndex::build(&lat, &lon, GRID_CELL_SIZE);
|
||||
|
||||
|
|
@ -144,6 +147,7 @@ impl ActualListingData {
|
|||
listing_date_iso,
|
||||
features,
|
||||
filter_feature_data,
|
||||
poi_filter_feature_data,
|
||||
grid,
|
||||
})
|
||||
}
|
||||
|
|
@ -174,49 +178,37 @@ impl ActualListingData {
|
|||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
fn build_filter_feature_data(
|
||||
df: &DataFrame,
|
||||
property_data: Option<&PropertyData>,
|
||||
postcode: &[String],
|
||||
address: &[Option<String>],
|
||||
property_type: &[Option<String>],
|
||||
leasehold_freehold: &[Option<String>],
|
||||
rooms_total: &[Option<i32>],
|
||||
floor_area_sqm: &[Option<f32>],
|
||||
asking_price: &[Option<i64>],
|
||||
asking_price_per_sqm: &[Option<f32>],
|
||||
) -> Vec<u16> {
|
||||
) -> Result<Vec<u16>> {
|
||||
let Some(property_data) = property_data else {
|
||||
return Vec::new();
|
||||
return Ok(Vec::new());
|
||||
};
|
||||
|
||||
let num_features = property_data.num_features;
|
||||
let mut feature_data = vec![NAN_U16; postcode.len() * num_features];
|
||||
let mut joined_rows = 0usize;
|
||||
let row_count = df.height();
|
||||
let mut feature_data = vec![NAN_U16; row_count * num_features];
|
||||
let quant = property_data.quant_ref();
|
||||
let mut encoded_columns = 0usize;
|
||||
|
||||
for (row, postcode_value) in postcode.iter().enumerate() {
|
||||
let Some(address_value) = address[row]
|
||||
.as_deref()
|
||||
.map(str::trim)
|
||||
.filter(|v| !v.is_empty())
|
||||
else {
|
||||
continue;
|
||||
};
|
||||
|
||||
let query = format!("{address_value} {postcode_value}");
|
||||
let Some(&property_row) = property_data.search_addresses(&query, 1).first() else {
|
||||
continue;
|
||||
};
|
||||
if property_data.postcode(property_row) != postcode_value {
|
||||
continue;
|
||||
for (feat_idx, name) in property_data.feature_names.iter().enumerate() {
|
||||
if feat_idx < property_data.num_numeric {
|
||||
if let Some(values) = extract_optional_feature_f32(df, name)? {
|
||||
encode_numeric_feature(&mut feature_data, property_data, &quant, feat_idx, values);
|
||||
encoded_columns += 1;
|
||||
}
|
||||
} else if let Some(values) = extract_optional_feature_str(df, name)? {
|
||||
encode_enum_feature(&mut feature_data, property_data, feat_idx, values);
|
||||
encoded_columns += 1;
|
||||
}
|
||||
|
||||
let dst = row * num_features;
|
||||
let src = property_row * num_features;
|
||||
feature_data[dst..dst + num_features]
|
||||
.copy_from_slice(&property_data.feature_data[src..src + num_features]);
|
||||
joined_rows += 1;
|
||||
}
|
||||
|
||||
let quant = property_data.quant_ref();
|
||||
overlay_numeric_feature(
|
||||
&mut feature_data,
|
||||
property_data,
|
||||
|
|
@ -281,11 +273,50 @@ fn build_filter_feature_data(
|
|||
);
|
||||
|
||||
info!(
|
||||
rows = postcode.len(),
|
||||
joined_rows, "Actual listings joined to property feature matrix"
|
||||
rows = row_count,
|
||||
encoded_columns, "Actual listings feature matrix read from enriched parquet"
|
||||
);
|
||||
|
||||
feature_data
|
||||
Ok(feature_data)
|
||||
}
|
||||
|
||||
fn build_poi_filter_feature_data(
|
||||
df: &DataFrame,
|
||||
property_data: Option<&PropertyData>,
|
||||
) -> Result<Vec<u16>> {
|
||||
let Some(property_data) = property_data else {
|
||||
return Ok(Vec::new());
|
||||
};
|
||||
let poi_metrics = &property_data.poi_metrics;
|
||||
let num_features = poi_metrics.num_features();
|
||||
if num_features == 0 {
|
||||
return Ok(Vec::new());
|
||||
}
|
||||
|
||||
let row_count = df.height();
|
||||
let mut feature_data = vec![NAN_U16; row_count * num_features];
|
||||
let quant = poi_metrics.quant_ref();
|
||||
let mut encoded_columns = 0usize;
|
||||
|
||||
for (metric_idx, name) in poi_metrics.feature_names.iter().enumerate() {
|
||||
let Some(values) = extract_optional_feature_f32(df, name)? else {
|
||||
continue;
|
||||
};
|
||||
for (row, value) in values.into_iter().enumerate() {
|
||||
let dst = row * num_features + metric_idx;
|
||||
feature_data[dst] = value
|
||||
.map(|value| encode_numeric_value(&quant, metric_idx, value))
|
||||
.unwrap_or(NAN_U16);
|
||||
}
|
||||
encoded_columns += 1;
|
||||
}
|
||||
|
||||
info!(
|
||||
rows = row_count,
|
||||
encoded_columns, "Actual listings POI metrics read from enriched parquet"
|
||||
);
|
||||
|
||||
Ok(feature_data)
|
||||
}
|
||||
|
||||
fn feature_index(property_data: &PropertyData, name: &str) -> Option<usize> {
|
||||
|
|
@ -323,6 +354,53 @@ fn overlay_numeric_feature<I>(
|
|||
}
|
||||
}
|
||||
|
||||
fn encode_numeric_feature<I>(
|
||||
feature_data: &mut [u16],
|
||||
property_data: &PropertyData,
|
||||
quant: &QuantRef<'_>,
|
||||
feat_idx: usize,
|
||||
values: I,
|
||||
) where
|
||||
I: IntoIterator<Item = Option<f32>>,
|
||||
{
|
||||
let num_features = property_data.num_features;
|
||||
for (row, value) in values.into_iter().enumerate() {
|
||||
let dst = row * num_features + feat_idx;
|
||||
feature_data[dst] = value
|
||||
.map(|value| encode_numeric_value(quant, feat_idx, value))
|
||||
.unwrap_or(NAN_U16);
|
||||
}
|
||||
}
|
||||
|
||||
fn extract_optional_feature_f32(df: &DataFrame, name: &str) -> Result<Option<Vec<Option<f32>>>> {
|
||||
let Ok(column) = df.column(name) else {
|
||||
return Ok(None);
|
||||
};
|
||||
|
||||
if matches!(column.dtype(), DataType::Datetime(_, _) | DataType::Date) {
|
||||
let projected = df
|
||||
.clone()
|
||||
.lazy()
|
||||
.select([(col(name).dt().year().cast(DataType::Float32)
|
||||
+ (col(name).dt().month().cast(DataType::Float32) - lit(1.0f32)) / lit(12.0f32))
|
||||
.alias("__feature")])
|
||||
.collect()
|
||||
.with_context(|| format!("Failed to convert datetime feature '{name}'"))?;
|
||||
return Ok(Some(extract_opt_f32(&projected, "__feature")?));
|
||||
}
|
||||
|
||||
let cast = column
|
||||
.cast(&DataType::Float32)
|
||||
.with_context(|| format!("Failed to cast feature '{name}' to Float32"))?;
|
||||
let values = cast
|
||||
.f32()
|
||||
.with_context(|| format!("Feature '{name}' is not Float32"))?
|
||||
.into_iter()
|
||||
.map(|value| value.filter(|v| v.is_finite()))
|
||||
.collect();
|
||||
Ok(Some(values))
|
||||
}
|
||||
|
||||
fn overlay_enum_feature<'a, I>(
|
||||
feature_data: &mut [u16],
|
||||
property_data: &PropertyData,
|
||||
|
|
@ -355,6 +433,46 @@ fn overlay_enum_feature<'a, I>(
|
|||
}
|
||||
}
|
||||
|
||||
fn encode_enum_feature(
|
||||
feature_data: &mut [u16],
|
||||
property_data: &PropertyData,
|
||||
feat_idx: usize,
|
||||
values: Vec<Option<String>>,
|
||||
) {
|
||||
let Some(enum_values) = property_data.enum_values.get(&feat_idx) else {
|
||||
return;
|
||||
};
|
||||
let num_features = property_data.num_features;
|
||||
for (row, value) in values.into_iter().enumerate() {
|
||||
let dst = row * num_features + feat_idx;
|
||||
feature_data[dst] = value
|
||||
.as_deref()
|
||||
.map(str::trim)
|
||||
.filter(|text| !text.is_empty())
|
||||
.and_then(|text| enum_values.iter().position(|candidate| candidate == text))
|
||||
.map(|position| position as u16)
|
||||
.unwrap_or(NAN_U16);
|
||||
}
|
||||
}
|
||||
|
||||
fn extract_optional_feature_str(df: &DataFrame, name: &str) -> Result<Option<Vec<Option<String>>>> {
|
||||
let Ok(column) = df.column(name) else {
|
||||
return Ok(None);
|
||||
};
|
||||
let cast = column
|
||||
.cast(&DataType::String)
|
||||
.with_context(|| format!("Failed to cast feature '{name}' to String"))?;
|
||||
let strings = cast
|
||||
.str()
|
||||
.with_context(|| format!("Feature '{name}' is not a string column"))?;
|
||||
Ok(Some(
|
||||
strings
|
||||
.into_iter()
|
||||
.map(|value| value.and_then(|text| (!text.trim().is_empty()).then(|| text.to_string())))
|
||||
.collect(),
|
||||
))
|
||||
}
|
||||
|
||||
fn encode_numeric_value(quant: &QuantRef<'_>, feat_idx: usize, value: f32) -> u16 {
|
||||
if !value.is_finite() {
|
||||
return NAN_U16;
|
||||
|
|
@ -517,8 +635,13 @@ mod tests {
|
|||
use std::path::PathBuf;
|
||||
|
||||
/// Locates the sample listings parquet used by the tests.
///
/// Candidates are probed in order, so the enriched file wins over the plain
/// one when both exist. Returns `None` when neither file is present.
fn sample_path() -> Option<PathBuf> {
    [
        "../finder/data/online_listings_buy_enriched.parquet",
        "../finder/data/online_listings_buy.parquet",
    ]
    .into_iter()
    .map(PathBuf::from)
    .find(|path| path.exists())
}
|
||||
|
||||
#[test]
|
||||
|
|
|
|||
|
|
@ -63,7 +63,20 @@ const DASHBOARD_POI_GROUPS: &[(&str, &[&str])] = &[
|
|||
("Groceries", GROCERY_DASHBOARD_CATEGORIES),
|
||||
("Food & Drink", &["Café", "Restaurant", "Pub", "Fast Food"]),
|
||||
("Green Space", &["Park", "Playground"]),
|
||||
("Education", &["School"]),
|
||||
(
|
||||
"Education",
|
||||
&[
|
||||
"Nursery school",
|
||||
"Primary school",
|
||||
"Secondary school",
|
||||
"All-through school",
|
||||
"Sixth form",
|
||||
"Further education college",
|
||||
"University",
|
||||
"Special school",
|
||||
"School",
|
||||
],
|
||||
),
|
||||
(
|
||||
"Health",
|
||||
&["GP Surgery", "Pharmacy", "Dentist", "Hospital & Clinic"],
|
||||
|
|
@ -119,6 +132,21 @@ fn canonical_poi_category(category: &str) -> &str {
|
|||
}
|
||||
}
|
||||
|
||||
/// Categories the pipeline emits for the GIAS-derived school POIs. A bare
/// `poi=School` URL (predating the per-phase split) is expanded to all of these
/// so bookmarked links keep showing schools.
///
/// The list deliberately contains `"School"` itself, so data that still carries
/// the un-split category keeps matching. It holds the same entries as the
/// "Education" group in `DASHBOARD_POI_GROUPS`.
const SCHOOL_CATEGORY_ALIASES: &[&str] = &[
    "Nursery school",
    "Primary school",
    "Secondary school",
    "All-through school",
    "Sixth form",
    "Further education college",
    "University",
    "Special school",
    "School",
];
|
||||
|
||||
pub fn resolve_poi_category_filter(category_values: &[String], categories: &str) -> FxHashSet<u16> {
|
||||
let mut selected = FxHashSet::default();
|
||||
for part in categories.split(',') {
|
||||
|
|
@ -126,6 +154,12 @@ pub fn resolve_poi_category_filter(category_values: &[String], categories: &str)
|
|||
if category.is_empty() {
|
||||
continue;
|
||||
}
|
||||
if category == "School" {
|
||||
for alias in SCHOOL_CATEGORY_ALIASES {
|
||||
add_category_filter_index(category_values, alias, &mut selected);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
add_category_filter_index(category_values, category, &mut selected);
|
||||
}
|
||||
selected
|
||||
|
|
@ -174,6 +208,8 @@ pub struct SchoolMetadata {
|
|||
pub telephone: Option<String>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub head_name: Option<String>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub ofsted_rating: Option<String>,
|
||||
}
|
||||
|
||||
pub struct POIData {
|
||||
|
|
@ -350,6 +386,8 @@ fn build_school_meta(
|
|||
let website = extract_optional_str_col(df, "school_website")?.unwrap_or_default();
|
||||
let telephone = extract_optional_str_col(df, "school_telephone")?.unwrap_or_default();
|
||||
let head_name = extract_optional_str_col(df, "school_head_name")?.unwrap_or_default();
|
||||
let ofsted_rating =
|
||||
extract_optional_str_col(df, "school_ofsted_rating")?.unwrap_or_default();
|
||||
|
||||
let fetch_str = |col: &Vec<Option<String>>, row: usize| -> Option<String> {
|
||||
col.get(row).cloned().flatten()
|
||||
|
|
@ -390,6 +428,7 @@ fn build_school_meta(
|
|||
website: fetch_str(&website, row),
|
||||
telephone: fetch_str(&telephone, row),
|
||||
head_name: fetch_str(&head_name, row),
|
||||
ofsted_rating: fetch_str(&ofsted_rating, row),
|
||||
});
|
||||
}
|
||||
Ok((idx, meta))
|
||||
|
|
@ -578,6 +617,26 @@ mod tests {
|
|||
assert!(selected.is_empty());
|
||||
}
|
||||
|
||||
#[test]
fn legacy_school_filter_expands_to_all_school_categories() {
    // Links bookmarked before schools were split by phase still send
    // `poi=School`; the filter must select every loaded school category and
    // nothing else.
    let values: Vec<String> = ["Primary school", "Secondary school", "University", "Tesco"]
        .into_iter()
        .map(String::from)
        .collect();

    let selected = resolve_poi_category_filter(&values, "School");

    for school_idx in [0u16, 1, 2] {
        assert!(selected.contains(&school_idx));
    }
    assert!(!selected.contains(&3));
    assert_eq!(selected.len(), 3);
}
|
||||
|
||||
#[test]
|
||||
fn coop_category_aliases_resolve_to_single_category() {
|
||||
let values = vec!["Co-op".to_string(), "Tesco".to_string()];
|
||||
|
|
|
|||
|
|
@ -891,6 +891,15 @@ impl PropertyData {
|
|||
(&self.postcode_interner, &self.postcode_keys)
|
||||
}
|
||||
|
||||
/// Property rows for a given postcode string, or empty if unknown.
|
||||
pub fn rows_for_postcode(&self, postcode: &str) -> &[u32] {
|
||||
self.postcode_interner
|
||||
.get(postcode)
|
||||
.and_then(|key| self.postcode_row_index.get(&key))
|
||||
.map(Vec::as_slice)
|
||||
.unwrap_or(&[])
|
||||
}
|
||||
|
||||
fn row_address_search_tokens(&self, row: usize) -> &[lasso::Spur] {
|
||||
let offset = self.address_search_token_offsets[row] as usize;
|
||||
let length = self.address_search_token_lengths[row] as usize;
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue