| // Copyright 2023 Google LLC |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
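| |
| //! Criterion benchmarks for `handle_map` operations (allocate, get, get_mut, |
| //! deallocate) in single-threaded and multi-threaded scenarios; typically run |
| //! via `cargo bench`. |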
| use criterion::{black_box, criterion_group, criterion_main, Criterion}; |
| use handle_map::*; |
| use std::sync::Arc; |
| use std::thread; |
| use std::time::{Duration, Instant}; |
| |
| // For the sake of these benchmarks, we just want |
| // to be able to allocate an almost-arbitrary number of handles |
| // if needed by the particular benchmark. |
| const MAX_ACTIVE_HANDLES: u32 = u32::MAX - 1; |
| |
| // The default number of threads+shards to use |
| // in multi-threaded benchmarks, chosen |
| // to represent a typical multi-threaded use-case. |
| const DEFAULT_SHARDS_AND_THREADS: u8 = 8; |
| |
| /// We use `u8` as the wrapped object type to deliberately ignore |
| /// costs related to accessing the underlying data - we only care about |
| /// benchmarking handle-map operations. |
| type BenchHandleMap = HandleMap<u8>; |
| |
| fn build_handle_map(num_shards: u8) -> BenchHandleMap { |
| let dimensions = HandleMapDimensions { num_shards, max_active_handles: MAX_ACTIVE_HANDLES }; |
| HandleMap::with_dimensions(dimensions) |
| } |
| |
| /// Benchmark for repeated reads on a single thread |
| fn single_threaded_read_benchmark(c: &mut Criterion) { |
| // The number of shards should not matter much here. |
| let handle_map = build_handle_map(8); |
| // Pre-populate the map with a single handle. |
| let handle = handle_map.allocate(|| 0xFF).unwrap(); |
| let handle_map_ref = &handle_map; |
| // Perform repeated reads |
| c.bench_function("single-threaded reads", |b| { |
| b.iter(|| { |
| let guard = handle_map_ref.get(black_box(handle)).unwrap(); |
| black_box(*guard) |
| }) |
| }); |
| } |
| |
| /// Benchmark for repeated writes on a single thread |
| fn single_threaded_write_benchmark(c: &mut Criterion) { |
| // The number of shards should not matter much here. |
| let handle_map = build_handle_map(8); |
| // Pre-populate the map with a single handle. |
| let handle = handle_map.allocate(|| 0xFF).unwrap(); |
| let handle_map_ref = &handle_map; |
| // Perform repeated writes |
| c.bench_function("single-threaded writes", |b| { |
| b.iter(|| { |
| let mut guard = handle_map_ref.get_mut(black_box(handle)).unwrap(); |
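| // `black_box` keeps the compiler from constant-folding the multiplication away. |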
| *guard *= black_box(0); |
| }) |
| }); |
| } |
| |
| /// Benchmark for repeated allocations on a single thread |
| #[allow(unused_must_use)] |
| fn single_threaded_allocate_benchmark(c: &mut Criterion) { |
| // The number of shards here is chosen to be quasi-reasonable. |
| // Realistically, performance will fluctuate somewhat with this choice |
| // due to differences in the number of internal hash-map collisions, |
| // but ultimately we only care about the asymptotic behavior. |
| let handle_map = build_handle_map(8); |
| let handle_map_ref = &handle_map; |
| // Perform repeated allocations |
| c.bench_function("single-threaded allocations", |b| { |
| b.iter(|| { |
| handle_map_ref.allocate(|| black_box(0xEE)); |
| }) |
| }); |
| } |
| |
| /// Benchmark for repeated allocation->deallocation on a single thread |
| /// starting from an empty map. |
| #[allow(unused_must_use)] |
| fn single_threaded_allocate_deallocate_near_empty_benchmark(c: &mut Criterion) { |
| // The number of shards should not matter much here. |
| let handle_map = build_handle_map(8); |
| let handle_map_ref = &handle_map; |
| // Perform repeated allocation/deallocation pairs |
| c.bench_function("single-threaded allocate/deallocate pairs (empty init state)", |b| { |
| b.iter(|| { |
| let handle = handle_map_ref.allocate(|| black_box(0xDD)).unwrap(); |
| handle_map_ref.deallocate(handle); |
| }) |
| }); |
| } |
| |
| /// Benchmark for repeated allocation->deallocation starting |
| /// from a map with a thousand elements per shard. |
| #[allow(unused_must_use)] |
| fn single_threaded_allocate_deallocate_one_thousand_benchmark(c: &mut Criterion) { |
| let handle_map = build_handle_map(8); |
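| // Pre-populate with roughly one thousand elements per shard (8 shards x 1000 allocations). |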
| for _ in 0..(8 * 1000) { |
| handle_map.allocate(|| black_box(0xFF)).unwrap(); |
| } |
| let handle_map_ref = &handle_map; |
| |
| // Perform repeated allocation/deallocation pairs |
| c.bench_function( |
| "single-threaded allocate/deallocate pairs (init one thousand elements per shard)", |
| |b| { |
| b.iter(|| { |
| let handle = handle_map_ref.allocate(|| black_box(0xDD)).unwrap(); |
| handle_map_ref.deallocate(handle); |
| }) |
| }, |
| ); |
| } |
| |
| /// Benchmark for repeated allocation->deallocation starting |
| /// from a map with a million elements per shard. |
| #[allow(unused_must_use)] |
| fn single_threaded_allocate_deallocate_one_million_benchmark(c: &mut Criterion) { |
| let handle_map = build_handle_map(8); |
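| // Pre-populate with roughly one million elements per shard. |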
| for _ in 0..(8 * 1_000_000) { |
| handle_map.allocate(|| black_box(0xFF)).unwrap(); |
| } |
| let handle_map_ref = &handle_map; |
| |
| // Perform repeated allocation/deallocation pairs |
| c.bench_function( |
| "single-threaded allocate/deallocate pairs (init one million elements per shard)", |
| |b| { |
| b.iter(|| { |
| let handle = handle_map_ref.allocate(|| black_box(0xDD)).unwrap(); |
| handle_map_ref.deallocate(handle); |
| }) |
| }, |
| ); |
| } |
| |
| /// Helper function for creating a multi-threaded benchmark |
| /// with the given number of threads. Untimed initialization of state |
| /// should occur prior to this call. This function repeatedly |
| /// executes the benchmark code on all threads and reports the |
| /// mean per-thread duration. |
| fn bench_multithreaded<F>(name: &str, num_threads: u8, benchable_fn_ref: Arc<F>, c: &mut Criterion) |
| where |
| F: Fn() + Send + Sync + 'static, |
| { |
| c.bench_function(name, move |b| { |
| b.iter_custom(|iters| { |
| // The returned results from the threads will be their timings |
| let mut join_handles = Vec::new(); |
| for _ in 0..num_threads { |
| let benchable_fn_ref_clone = benchable_fn_ref.clone(); |
| let join_handle = thread::spawn(move || { |
| let start = Instant::now(); |
| for _ in 0..iters { |
| benchable_fn_ref_clone(); |
| } |
| start.elapsed() |
| }); |
| join_handles.push(join_handle); |
| } |
| // Threads spawned; now collect the results. |
| let mut total_duration = Duration::from_nanos(0); |
| for join_handle in join_handles { |
| let thread_duration = join_handle.join().unwrap(); |
| total_duration += thread_duration; |
| } |
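| // Average across threads so the reported time is the mean per-thread duration. |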
| total_duration /= num_threads as u32; |
| total_duration |
| }) |
| }); |
| } |
| |
| /// Defines the number of reads/writes to perform (relative |
| /// to other operations performed in a given test). |
| #[derive(Debug, Clone, Copy)] |
| struct ReadWriteCount { |
| num_reads: usize, |
| num_writes: usize, |
| } |
| |
| const READS_ONLY: ReadWriteCount = ReadWriteCount { num_reads: 4, num_writes: 0 }; |
| |
| const READ_LEANING: ReadWriteCount = ReadWriteCount { num_reads: 3, num_writes: 1 }; |
| |
| const BALANCED: ReadWriteCount = ReadWriteCount { num_reads: 2, num_writes: 2 }; |
| |
| const WRITE_LEANING: ReadWriteCount = ReadWriteCount { num_reads: 1, num_writes: 3 }; |
| |
| const WRITES_ONLY: ReadWriteCount = ReadWriteCount { num_reads: 0, num_writes: 4 }; |
| |
| const READ_WRITE_COUNTS: [ReadWriteCount; 5] = |
| [READS_ONLY, READ_LEANING, BALANCED, WRITE_LEANING, WRITES_ONLY]; |
| |
| /// Benchmarks a repeated allocate/[X writes]/[Y reads]/deallocate workflow across |
| /// the default number of threads and shards. |
| fn lifecycle_read_and_write_multithreaded(per_object_rw_count: ReadWriteCount, c: &mut Criterion) { |
| let name = format!( |
| "lifecycle_read_and_write_multithreaded(reads/object: {}, writes/object:{})", |
| per_object_rw_count.num_reads, per_object_rw_count.num_writes |
| ); |
| // We need an Arc to be able to pass this off to `bench_multithreaded`. |
| let handle_map = Arc::new(build_handle_map(DEFAULT_SHARDS_AND_THREADS)); |
| bench_multithreaded( |
| &name, |
| DEFAULT_SHARDS_AND_THREADS, |
| Arc::new(move || { |
| let handle = handle_map.allocate(|| black_box(0xFF)).unwrap(); |
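| // Writes multiply by a black-boxed one: the stored value is unchanged, |
| // but the compiler cannot optimize the write away. |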
| for _ in 0..per_object_rw_count.num_writes { |
| let mut write_guard = handle_map.get_mut(handle).unwrap(); |
| *write_guard *= black_box(1); |
| } |
| for _ in 0..per_object_rw_count.num_reads { |
| let read_guard = handle_map.get(handle).unwrap(); |
| black_box(*read_guard); |
| } |
| handle_map.deallocate(handle).unwrap(); |
| }), |
| c, |
| ); |
| } |
| |
| fn lifecycle_benchmarks(c: &mut Criterion) { |
| for read_write_count in READ_WRITE_COUNTS { |
| // Uses DEFAULT_SHARDS_AND_THREADS shards so the shard count roughly matches the number of active threads. |
| lifecycle_read_and_write_multithreaded(read_write_count, c); |
| } |
| } |
| |
| /// Benchmarks repeated cycles of accessing the given number of distinct objects, each |
| /// with the given number of reads and writes across the given number of threads. |
| fn read_and_write_contention_multithreaded( |
| num_threads: u8, |
| num_distinct_objects: u32, |
| per_object_rw_count: ReadWriteCount, |
| c: &mut Criterion, |
| ) { |
| let name = format!("read_and_write_contention_multithreaded(threads: {}, objects: {}, reads/object: {}, writes/object:{})", |
| num_threads, num_distinct_objects, per_object_rw_count.num_reads, per_object_rw_count.num_writes); |
| // Default to 8 shards; this ultimately doesn't matter much since we only ever access a few shards. |
| // The map is wrapped in an `Arc` so the `'static` closure passed to `bench_multithreaded` can own it. |
| let handle_map = Arc::new(build_handle_map(8)); |
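| // Pre-populate the map with the handles that every thread will contend over. |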
| let mut handles = Vec::new(); |
| for i in 0..num_distinct_objects { |
| let handle = handle_map.allocate(|| i as u8).unwrap(); |
| handles.push(handle); |
| } |
| |
| bench_multithreaded( |
| &name, |
| num_threads, |
| Arc::new(move || { |
| // Deliberately performing all writes first, with |
| // the goal of getting staggered timings from |
| // all of the threads for subsequent iterations. |
| // We also perform reads and writes across all objects |
| // in batches to get something resembling uniform access. |
| for _ in 0..per_object_rw_count.num_writes { |
| for handle in handles.iter() { |
| let mut write_guard = handle_map.get_mut(*handle).unwrap(); |
| *write_guard *= black_box(1); |
| } |
| } |
| for _ in 0..per_object_rw_count.num_reads { |
| for handle in handles.iter() { |
| let read_guard = handle_map.get(*handle).unwrap(); |
| black_box(*read_guard); |
| } |
| } |
| }), |
| c, |
| ); |
| } |
| |
| fn read_write_contention_benchmarks(c: &mut Criterion) { |
| for num_threads in [2, 4, 8].iter() { |
| for num_distinct_objects in [1u32, 2u32, 4u32].iter() { |
| // Only benchmark configurations with fewer distinct objects than threads, |
| // since those are the cases with meaningful cross-thread contention. |
| if *num_distinct_objects >= *num_threads as u32 { |
| continue; |
| } |
| for read_write_count in READ_WRITE_COUNTS { |
| read_and_write_contention_multithreaded( |
| *num_threads, |
| *num_distinct_objects, |
| read_write_count, |
| c, |
| ); |
| } |
| } |
| } |
| } |
| |
| /// Benchmarks allocation (starting from empty) for the default number of shards and threads |
| fn allocator_contention_benchmark(c: &mut Criterion) { |
| let name = "allocator_contention_multithreaded"; |
| // We need an Arc to pass this off to `bench_multithreaded` |
| let handle_map = Arc::new(build_handle_map(DEFAULT_SHARDS_AND_THREADS)); |
| bench_multithreaded( |
| name, |
| DEFAULT_SHARDS_AND_THREADS, |
| Arc::new(move || { |
| handle_map.allocate(|| black_box(0xFF)).unwrap(); |
| }), |
| c, |
| ); |
| } |
| |
| criterion_group!( |
| benches, |
| single_threaded_read_benchmark, |
| single_threaded_write_benchmark, |
| single_threaded_allocate_benchmark, |
| single_threaded_allocate_deallocate_near_empty_benchmark, |
| single_threaded_allocate_deallocate_one_thousand_benchmark, |
| single_threaded_allocate_deallocate_one_million_benchmark, |
| lifecycle_benchmarks, |
| read_write_contention_benchmarks, |
| allocator_contention_benchmark, |
| ); |
| criterion_main!(benches); |