Rust things

This commit is contained in:
Andras Schmelczer 2026-05-10 14:55:43 +01:00
parent fc10381692
commit 3debacab4f
30 changed files with 3257 additions and 647 deletions

View file

@ -97,7 +97,7 @@ fn build_search_text(name: &str, place_type: &str) -> String {
}
if place_type == "station" {
let suffix_aliases: [(&str, &[&str]); 5] = [
let suffix_aliases: [(&str, &[&str]); 6] = [
(
" tube station",
&[" underground station", " station", " tube", " underground"],
@ -118,6 +118,7 @@ fn build_search_text(name: &str, place_type: &str) -> String {
" elizabeth line station",
&[" station", " elizabeth line", " crossrail station"],
),
(" dlr station", &[" station", " dlr"]),
];
for (suffix, replacements) in suffix_aliases {
@ -139,10 +140,15 @@ fn extract_str_col(df: &DataFrame, name: &str) -> anyhow::Result<Vec<String>> {
let string_column = column
.str()
.with_context(|| format!("Column '{name}' is not a string column"))?;
Ok(string_column
string_column
.into_iter()
.map(|value| value.unwrap_or("").to_string())
.collect())
.enumerate()
.map(|(row, value)| {
value
.map(ToString::to_string)
.with_context(|| format!("Column '{name}' has null at row {row}"))
})
.collect()
}
fn extract_f32_col(df: &DataFrame, name: &str) -> anyhow::Result<Vec<f32>> {
@ -155,33 +161,37 @@ fn extract_f32_col(df: &DataFrame, name: &str) -> anyhow::Result<Vec<f32>> {
let float_column = cast
.f32()
.with_context(|| format!("Column '{name}' is not a float32 column"))?;
Ok(float_column
float_column
.into_iter()
.map(|value| value.unwrap_or(0.0))
.collect())
.enumerate()
.map(|(row, value)| value.with_context(|| format!("Column '{name}' has null at row {row}")))
.collect()
}
// NOTE(review): this span looks like a rendered diff in which two versions of
// the boolean-column extractor are interleaved without +/- markers, so it is
// not a single compilable function as shown — confirm against the real file.
//
// `extract_bool_col_or_default` (presumably the earlier version): when the
// column lookup fails it returns `default_value` repeated `df.height()` times.
fn extract_bool_col_or_default(
df: &DataFrame,
name: &str,
default_value: bool,
) -> anyhow::Result<Vec<bool>> {
let Ok(column) = df.column(name) else {
return Ok(vec![default_value; df.height()]);
};
// `extract_bool_col` (presumably the replacement): a failed column lookup is
// reported as an error ("Missing column ...") instead of being defaulted.
fn extract_bool_col(df: &DataFrame, name: &str) -> anyhow::Result<Vec<bool>> {
let column = df
.column(name)
.with_context(|| format!("Missing column '{name}' in places data"))?;
// A column that cannot be viewed as boolean is an error.
let bool_column = column
.bool()
.with_context(|| format!("Column '{name}' is not a boolean column"))?;
// Two tails are interleaved below: one replaces a null cell with
// `default_value` (`unwrap_or`); the other numbers the cells with
// `enumerate` and turns a null cell into an error naming the column and row.
Ok(bool_column
bool_column
.into_iter()
.map(|value| value.unwrap_or(default_value))
.collect())
.enumerate()
.map(|(row, value)| value.with_context(|| format!("Column '{name}' has null at row {row}")))
.collect()
}
impl PlaceData {
/// Loads place data from the parquet file at `parquet_path`.
///
/// The work itself is done by `Self::load_inner`; this wrapper only packages
/// that call as a closure, hands it to `super::run_polars_io`, and returns the
/// result of that call unchanged.
pub fn load(parquet_path: &Path) -> anyhow::Result<Self> {
    let load_places = || Self::load_inner(parquet_path);
    super::run_polars_io(load_places)
}
fn load_inner(parquet_path: &Path) -> anyhow::Result<Self> {
info!("Loading place data from {:?}...", parquet_path);
let parquet_path = PlRefPath::try_from_path(parquet_path)
.context("Failed to normalize places parquet path")?;
let df = LazyFrame::scan_parquet(parquet_path, Default::default())
.context("Failed to scan places parquet")?
.collect()
@ -210,7 +220,7 @@ impl PlaceData {
let type_rank_vec: Vec<u8> = place_type_raw.iter().map(|pt| type_rank(pt)).collect();
let place_type = InternedColumn::build(&place_type_raw);
let travel_destination = if df.column("travel_destination").is_ok() {
extract_bool_col_or_default(&df, "travel_destination", true)?
extract_bool_col(&df, "travel_destination")?
} else {
place_type_raw
.iter()
@ -296,6 +306,7 @@ mod tests {
assert!(build_search_text("King's Cross tube station", "station")
.contains("kings cross underground"));
assert!(build_search_text("St Albans", "city").contains("saint albans"));
assert!(build_search_text("Shadwell DLR station", "station").contains("shadwell station"));
}
#[test]