diff --git a/Dockerfile b/Dockerfile index 7fb2826..f460db4 100644 --- a/Dockerfile +++ b/Dockerfile @@ -39,6 +39,10 @@ VOLUME ["/app/data"] RUN chown -R appuser:appuser /app USER appuser +# Fallback for any allocations not served by jemalloc (the binary's global +# allocator, tuned via the baked-in malloc_conf): cap glibc to 2 arenas so freed +# memory coalesces and is returned instead of fragmenting across per-CPU arenas. +ENV MALLOC_ARENA_MAX=2 EXPOSE 8001 HEALTHCHECK --interval=30s --timeout=5s --start-period=120s --retries=3 \ CMD curl -f http://localhost:8001/health || exit 1 diff --git a/docker-compose.yml b/docker-compose.yml index f5cc036..e43bca7 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -32,6 +32,9 @@ services: - ./property-data:/app/data:ro - ./finder/data:/app/finder-data:ro environment: + # Fallback only — the binary uses jemalloc as its global allocator + # (tuned via a baked-in malloc_conf). Caps glibc to 2 arenas. + MALLOC_ARENA_MAX: "2" POCKETBASE_URL: http://pocketbase:8090 POCKETBASE_ADMIN_EMAIL: *pb-email POCKETBASE_ADMIN_PASSWORD: *pb-password diff --git a/server-rs/Cargo.lock b/server-rs/Cargo.lock index 51b98b0..4aa3069 100644 --- a/server-rs/Cargo.lock +++ b/server-rs/Cargo.lock @@ -3880,6 +3880,7 @@ version = "0.1.0" dependencies = [ "anyhow", "axum", + "bytes", "clap", "h3o", "hex", @@ -3900,6 +3901,7 @@ dependencies = [ "serde", "serde_json", "sha2 0.11.0", + "tikv-jemallocator", "tokio", "tower", "tower-http", @@ -5253,6 +5255,26 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "tikv-jemalloc-sys" +version = "0.6.1+5.3.0-1-ge13ca993e8ccb9ba9847cc330696e02839f328f7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd8aa5b2ab86a2cefa406d889139c162cbb230092f7d1d7cbc1716405d852a3b" +dependencies = [ + "cc", + "libc", +] + +[[package]] +name = "tikv-jemallocator" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"0359b4327f954e0567e69fb191cf1436617748813819c94b8cd4a431422d053a" +dependencies = [ + "libc", + "tikv-jemalloc-sys", +] + [[package]] name = "tilejson" version = "0.4.3" diff --git a/server-rs/Cargo.toml b/server-rs/Cargo.toml index 7a718cf..bb6b3b4 100644 --- a/server-rs/Cargo.toml +++ b/server-rs/Cargo.toml @@ -33,8 +33,15 @@ sha2 = "0.11" hex = "0.4" tower = { version = "0.5", features = ["limit"] } libc = "0.2" +bytes = "1" sentry = { version = "0.46.0", default-features = false, features = ["backtrace", "contexts", "debug-images", "panic", "reqwest", "rustls", "tracing", "tower-http", "tower-axum-matched-path"] } +# jemalloc returns freed memory to the OS far more aggressively than glibc malloc +# (which strands freed rayon/Polars load buffers across many per-CPU arenas, inflating +# steady-state RSS). Decay is configured via `malloc_conf` in main.rs. +[target.'cfg(not(target_env = "msvc"))'.dependencies] +tikv-jemallocator = { version = "0.6", features = ["unprefixed_malloc_on_supported_platforms"] } + [lints.clippy] min_ident_chars = "warn" diff --git a/server-rs/src/main.rs b/server-rs/src/main.rs index 4b5f318..ac2275c 100644 --- a/server-rs/src/main.rs +++ b/server-rs/src/main.rs @@ -19,6 +19,25 @@ mod routes; mod state; pub mod utils; +// Use jemalloc as the global allocator. glibc malloc keeps freed memory in many +// per-CPU arenas and rarely returns it to the OS, so the transient buffers from +// the rayon/Polars-parallel data load stay resident and inflate steady-state RSS. +// jemalloc's decay-based purging (configured below) hands those pages back. +#[cfg(not(target_env = "msvc"))] +#[global_allocator] +static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; + +// Return dirty/muzzy pages to the OS ~1s after they go idle, and run a background +// thread to do so proactively (so RSS drops after the startup load peak without +// waiting for the next allocation). 
Read by jemalloc at startup via the +// `malloc_conf` symbol (unprefixed on Linux). Can be overridden by the +// `_RJEM_MALLOC_CONF` / `MALLOC_CONF` env var. +#[cfg(not(target_env = "msvc"))] +#[allow(non_upper_case_globals)] +#[used] +#[export_name = "malloc_conf"] +pub static malloc_conf: &[u8] = b"background_thread:true,dirty_decay_ms:1000,muzzy_decay_ms:1000\0"; + use std::path::{Path, PathBuf}; use std::sync::Arc; use std::time::Duration; diff --git a/server-rs/src/routes/tiles.rs b/server-rs/src/routes/tiles.rs index b9efefd..bd79b5b 100644 --- a/server-rs/src/routes/tiles.rs +++ b/server-rs/src/routes/tiles.rs @@ -1,13 +1,64 @@ +use std::os::unix::fs::FileExt; use std::sync::Arc; use axum::extract::{Path, Query, State}; use axum::http::{header, StatusCode}; use axum::response::{IntoResponse, Response}; -use pmtiles::{AsyncPmTilesReader, MmapBackend, TileCoord}; +use bytes::Bytes; +use pmtiles::{ + AsyncBackend, AsyncPmTilesReader, BackendResponse, HashMapCache, PmtError, PmtResult, TileCoord, +}; use serde::Deserialize; use tracing::warn; -pub type TileReader = AsyncPmTilesReader<MmapBackend>; +/// PMTiles archives are read straight from disk with positional `pread` calls +/// instead of being memory-mapped. With an mmap backend every touched tile page +/// is attributed to the process RSS (~21 GB across all tilesets); with `pread` +/// the file contents stay in the (reclaimable) kernel page cache and only the +/// small per-request buffer lives in the process. A [`HashMapCache`] keeps the +/// archive's directory entries in memory so tile lookups don't re-read and +/// re-decompress directory pages from disk on every request. +pub type TileReader = AsyncPmTilesReader<FileBackend, HashMapCache>; + +/// An [`AsyncBackend`] that serves byte ranges from a local file via `pread`.
+pub struct FileBackend { + file: Arc<std::fs::File>, + len: u64, +} + +impl FileBackend { + pub fn open(path: &std::path::Path) -> std::io::Result<Self> { + let file = std::fs::File::open(path)?; + let len = file.metadata()?.len(); + Ok(Self { + file: Arc::new(file), + len, + }) + } +} + +impl AsyncBackend for FileBackend { + async fn read(&self, offset: usize, length: usize) -> PmtResult<BackendResponse> { + let available = (self.len as usize).saturating_sub(offset); + let read_len = length.min(available); + if read_len == 0 { + return Ok(BackendResponse::new(Bytes::new())); + } + + let file = Arc::clone(&self.file); + // pread is blocking; keep it off the async runtime's worker threads. + let buf = tokio::task::spawn_blocking(move || { + let mut buf = vec![0u8; read_len]; + file.read_exact_at(&mut buf, offset as u64)?; + std::io::Result::Ok(buf) + }) + .await + .map_err(|err| PmtError::Reading(std::io::Error::other(err)))? + .map_err(PmtError::Reading)?; + + Ok(BackendResponse::new(Bytes::from(buf))) + } +} pub async fn get_tile( State(reader): State<Arc<TileReader>>, @@ -260,7 +311,7 @@ fn build_style(is_dark: bool, layers: &[serde_json::Value], tile_url: &str) -> s } pub async fn init_tile_reader(path: &std::path::Path) -> anyhow::Result<TileReader> { - let backend = MmapBackend::try_from(path).await?; - let reader = AsyncPmTilesReader::try_from_source(backend).await?; + let backend = FileBackend::open(path)?; + let reader = AsyncPmTilesReader::try_from_cached_source(backend, HashMapCache::default()).await?; Ok(reader) }