seems fine
This commit is contained in:
parent
48983e3b4b
commit
7a1696541f
37 changed files with 4999 additions and 1242 deletions
|
|
@ -11,22 +11,127 @@ use crate::utils::InternedColumn;
|
|||
pub struct PlaceData {
|
||||
pub name: Vec<String>,
|
||||
pub name_lower: Vec<String>,
|
||||
pub name_search: Vec<String>,
|
||||
pub place_type: InternedColumn,
|
||||
pub type_rank: Vec<u8>,
|
||||
pub population: Vec<u32>,
|
||||
pub lat: Vec<f32>,
|
||||
pub lon: Vec<f32>,
|
||||
pub city: Vec<Option<String>>,
|
||||
pub travel_destination: Vec<bool>,
|
||||
}
|
||||
|
||||
/// Maps a raw place-type string to a sort rank; a lower rank means a more
/// prominent kind of place.
///
/// Ranks, in order: `city` (0), `town` (1), `village` (2), suburb-like areas
/// (3), `station` (4), hamlet-like places and islands (5), anything else (6).
///
/// The catch-all arm must stay last: a wildcard placed earlier makes every
/// arm after it unreachable and collapses all remaining types into one rank.
fn type_rank(place_type: &str) -> u8 {
    match place_type {
        "city" => 0,
        "town" => 1,
        "village" => 2,
        "suburb" | "neighbourhood" | "quarter" | "borough" | "locality" => 3,
        "station" => 4,
        "hamlet" | "isolated_dwelling" | "island" => 5,
        _ => 6,
    }
}
|
||||
|
||||
/// Returns `true` when `place_type` is exactly `"city"` or `"station"`.
///
/// The comparison is case-sensitive; every other place type yields `false`.
pub fn is_travel_destination_type(place_type: &str) -> bool {
    ["city", "station"].contains(&place_type)
}
|
||||
|
||||
/// Normalizes free text into a lowercase, space-separated search key.
///
/// * Apostrophe-like characters (`'`, `’`, `` ` ``) are dropped without
///   introducing a word break, so `King's` becomes `kings`.
/// * ASCII letters and digits are kept, with letters lowercased.
/// * Every other character (including non-ASCII letters) acts as a separator;
///   a run of separators collapses to a single space.
/// * The result never starts or ends with a space.
pub fn normalize_search_text(text: &str) -> String {
    let mut normalized = String::with_capacity(text.len());
    // Set once a separator follows emitted text; the space itself is written
    // only when another kept character arrives, so no trailing space appears.
    let mut gap_pending = false;

    for ch in text.chars() {
        match ch {
            '\'' | '’' | '`' => {}
            kept if kept.is_ascii_alphanumeric() => {
                if gap_pending {
                    normalized.push(' ');
                    gap_pending = false;
                }
                normalized.push(kept.to_ascii_lowercase());
            }
            // Separators before the first kept character are ignored.
            _ => gap_pending = !normalized.is_empty(),
        }
    }

    normalized
}
|
||||
|
||||
/// Replaces every whitespace-delimited token of `text` that equals `from`
/// with `to`.
///
/// Returns `Some` with the tokens re-joined by single spaces when at least one
/// token was replaced, and `None` when no token matched.
fn replace_token(text: &str, from: &str, to: &str) -> Option<String> {
    let mut rebuilt = String::with_capacity(text.len());
    let mut hit = false;

    for (position, word) in text.split_whitespace().enumerate() {
        if position > 0 {
            rebuilt.push(' ');
        }
        if word == from {
            hit = true;
            rebuilt.push_str(to);
        } else {
            rebuilt.push_str(word);
        }
    }

    if hit {
        Some(rebuilt)
    } else {
        None
    }
}
|
||||
|
||||
/// Appends `alias` to `aliases` unless it is empty or already present.
fn push_alias(aliases: &mut Vec<String>, alias: String) {
    if alias.is_empty() || aliases.contains(&alias) {
        return;
    }
    aliases.push(alias);
}
|
||||
|
||||
fn build_search_text(name: &str, place_type: &str) -> String {
|
||||
let primary = normalize_search_text(name);
|
||||
let mut aliases = vec![primary.clone()];
|
||||
|
||||
if let Some(alias) = replace_token(&primary, "st", "saint") {
|
||||
push_alias(&mut aliases, alias);
|
||||
}
|
||||
if let Some(alias) = replace_token(&primary, "saint", "st") {
|
||||
push_alias(&mut aliases, alias);
|
||||
}
|
||||
|
||||
if place_type == "station" {
|
||||
let suffix_aliases: [(&str, &[&str]); 5] = [
|
||||
(
|
||||
" tube station",
|
||||
&[" underground station", " station", " tube", " underground"],
|
||||
),
|
||||
(
|
||||
" underground station",
|
||||
&[" tube station", " station", " tube", " underground"],
|
||||
),
|
||||
(
|
||||
" railway station",
|
||||
&[" rail station", " station", " railway", " rail"],
|
||||
),
|
||||
(
|
||||
" overground station",
|
||||
&[" station", " overground", " railway station"],
|
||||
),
|
||||
(
|
||||
" elizabeth line station",
|
||||
&[" station", " elizabeth line", " crossrail station"],
|
||||
),
|
||||
];
|
||||
|
||||
for (suffix, replacements) in suffix_aliases {
|
||||
if let Some(stem) = primary.strip_suffix(suffix) {
|
||||
for replacement in replacements {
|
||||
push_alias(&mut aliases, format!("{stem}{replacement}"));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
aliases.join(" | ")
|
||||
}
|
||||
|
||||
fn extract_str_col(df: &DataFrame, name: &str) -> anyhow::Result<Vec<String>> {
|
||||
let column = df
|
||||
.column(name)
|
||||
|
|
@ -56,6 +161,23 @@ fn extract_f32_col(df: &DataFrame, name: &str) -> anyhow::Result<Vec<f32>> {
|
|||
.collect())
|
||||
}
|
||||
|
||||
fn extract_bool_col_or_default(
|
||||
df: &DataFrame,
|
||||
name: &str,
|
||||
default_value: bool,
|
||||
) -> anyhow::Result<Vec<bool>> {
|
||||
let Ok(column) = df.column(name) else {
|
||||
return Ok(vec![default_value; df.height()]);
|
||||
};
|
||||
let bool_column = column
|
||||
.bool()
|
||||
.with_context(|| format!("Column '{name}' is not a boolean column"))?;
|
||||
Ok(bool_column
|
||||
.into_iter()
|
||||
.map(|value| value.unwrap_or(default_value))
|
||||
.collect())
|
||||
}
|
||||
|
||||
impl PlaceData {
|
||||
pub fn load(parquet_path: &Path) -> anyhow::Result<Self> {
|
||||
info!("Loading place data from {:?}...", parquet_path);
|
||||
|
|
@ -80,8 +202,21 @@ impl PlaceData {
|
|||
};
|
||||
|
||||
let name_lower: Vec<String> = name.iter().map(|nm| nm.to_lowercase()).collect();
|
||||
let name_search: Vec<String> = name
|
||||
.iter()
|
||||
.zip(&place_type_raw)
|
||||
.map(|(nm, pt)| build_search_text(nm, pt))
|
||||
.collect();
|
||||
let type_rank_vec: Vec<u8> = place_type_raw.iter().map(|pt| type_rank(pt)).collect();
|
||||
let place_type = InternedColumn::build(&place_type_raw);
|
||||
let travel_destination = if df.column("travel_destination").is_ok() {
|
||||
extract_bool_col_or_default(&df, "travel_destination", true)?
|
||||
} else {
|
||||
place_type_raw
|
||||
.iter()
|
||||
.map(|place_type| is_travel_destination_type(place_type))
|
||||
.collect()
|
||||
};
|
||||
|
||||
// Precompute nearest city for each non-city place
|
||||
let city_indices: Vec<usize> = type_rank_vec
|
||||
|
|
@ -133,12 +268,14 @@ impl PlaceData {
|
|||
Ok(PlaceData {
|
||||
name,
|
||||
name_lower,
|
||||
name_search,
|
||||
place_type,
|
||||
type_rank: type_rank_vec,
|
||||
population,
|
||||
lat,
|
||||
lon,
|
||||
city,
|
||||
travel_destination,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
|
@ -149,7 +286,23 @@ mod tests {
|
|||
|
||||
/// Cities outrank towns and stations, towns outrank stations, and stations
/// outrank unrecognized place types.
#[test]
fn type_rank_ordering() {
    let city = type_rank("city");
    let town = type_rank("town");
    let station = type_rank("station");
    let unknown = type_rank("unknown");

    assert!(city < station);
    assert!(city < town);
    assert!(town < station);
    assert!(station < unknown);
}
|
||||
|
||||
/// Search text includes the underground alias for tube stations and the
/// expanded `saint` form for names starting with `St`.
#[test]
fn search_text_handles_common_address_variants() {
    let station_text = build_search_text("King's Cross tube station", "station");
    assert!(station_text.contains("kings cross underground"));

    let city_text = build_search_text("St Albans", "city");
    assert!(city_text.contains("saint albans"));
}
|
||||
|
||||
/// Only cities and stations count as travel destinations by type.
#[test]
fn travel_destination_types_match_legacy_places() {
    for accepted in ["city", "station"] {
        assert!(is_travel_destination_type(accepted));
    }
    for rejected in ["town", "suburb"] {
        assert!(!is_travel_destination_type(rejected));
    }
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue