Better Together Rust Devs | b902b34 | 2023-07-31 08:21:33 -0700 | [diff] [blame] | 1 | // Copyright 2023 Google LLC |
| 2 | // |
| 3 | // Licensed under the Apache License, Version 2.0 (the "License"); |
| 4 | // you may not use this file except in compliance with the License. |
| 5 | // You may obtain a copy of the License at |
| 6 | // |
| 7 | // http://www.apache.org/licenses/LICENSE-2.0 |
| 8 | // |
| 9 | // Unless required by applicable law or agreed to in writing, software |
| 10 | // distributed under the License is distributed on an "AS IS" BASIS, |
| 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 12 | // See the License for the specific language governing permissions and |
| 13 | // limitations under the License. |
| 14 | use criterion::{black_box, criterion_group, criterion_main, Criterion}; |
| 15 | use handle_map::*; |
| 16 | use std::sync::Arc; |
| 17 | use std::thread; |
| 18 | use std::time::{Duration, Instant}; |
| 19 | |
// For the sake of these benchmarks, we just want
// to be able to allocate an almost-arbitrary number of handles
// if needed by the particular benchmark. (u32::MAX - 1 leaves headroom
// below the absolute maximum handle value.)
const MAX_ACTIVE_HANDLES: u32 = u32::MAX - 1;

// The default number of threads+shards to use
// in multi-threaded benchmarks, chosen
// to represent a typical multi-threaded use-case.
// Using the same value for both keeps thread count and shard count matched.
const DEFAULT_SHARDS_AND_THREADS: u8 = 8;

/// We're using u8's for the wrapped object type to deliberately ignore
/// costs related to accessing the underlying data - we just care about
/// benchmarking handle-map operations.
type BenchHandleMap = HandleMap<u8>;
| 34 | |
| 35 | fn build_handle_map(num_shards: u8) -> BenchHandleMap { |
| 36 | let dimensions = HandleMapDimensions { num_shards, max_active_handles: MAX_ACTIVE_HANDLES }; |
| 37 | HandleMap::with_dimensions(dimensions) |
| 38 | } |
| 39 | |
| 40 | /// Benchmark for repeated reads on a single thread |
| 41 | fn single_threaded_read_benchmark(c: &mut Criterion) { |
| 42 | // The number of shards should not matter much here. |
| 43 | let handle_map = build_handle_map(8); |
| 44 | // Pre-populate the map with a single handle. |
| 45 | let handle = handle_map.allocate(|| 0xFF).unwrap(); |
| 46 | let handle_map_ref = &handle_map; |
| 47 | // Perform repeated reads |
| 48 | c.bench_function("single-threaded reads", |b| { |
| 49 | b.iter(|| { |
| 50 | let guard = handle_map_ref.get(black_box(handle)).unwrap(); |
| 51 | black_box(*guard) |
| 52 | }) |
| 53 | }); |
| 54 | } |
| 55 | |
| 56 | /// Benchmark for repeated writes on a single thread |
| 57 | fn single_threaded_write_benchmark(c: &mut Criterion) { |
| 58 | // The number of shards should not matter much here. |
| 59 | let handle_map = build_handle_map(8); |
| 60 | // Pre-populate the map with a single handle. |
| 61 | let handle = handle_map.allocate(|| 0xFF).unwrap(); |
| 62 | let handle_map_ref = &handle_map; |
| 63 | // Perform repeated writes |
| 64 | c.bench_function("single-threaded writes", |b| { |
| 65 | b.iter(|| { |
| 66 | let mut guard = handle_map_ref.get_mut(black_box(handle)).unwrap(); |
| 67 | *guard *= black_box(0); |
| 68 | }) |
| 69 | }); |
| 70 | } |
| 71 | |
| 72 | /// Benchmark for repeated allocations on a single thread |
| 73 | #[allow(unused_must_use)] |
| 74 | fn single_threaded_allocate_benchmark(c: &mut Criterion) { |
| 75 | // The number of shards here is chosen to be quasi-reasonable. |
| 76 | // Realistically, performance will fluctuate somewhat with this |
| 77 | // due to the difference in the number of internal hash-map collisions, |
| 78 | // but ultimately we only care about the asymptotic behavior. |
| 79 | let handle_map = build_handle_map(8); |
| 80 | let handle_map_ref = &handle_map; |
| 81 | // Perform repeated allocations |
| 82 | c.bench_function("single-threaded allocations", |b| { |
| 83 | b.iter(|| { |
| 84 | handle_map_ref.allocate(|| black_box(0xEE)); |
| 85 | }) |
| 86 | }); |
| 87 | } |
| 88 | |
| 89 | /// Benchmark for repeated allocation->deallocation on a single thread |
| 90 | /// starting from an empty map. |
| 91 | #[allow(unused_must_use)] |
| 92 | fn single_threaded_allocate_deallocate_near_empty_benchmark(c: &mut Criterion) { |
| 93 | // The number of shards should not matter much here. |
| 94 | let handle_map = build_handle_map(8); |
| 95 | let handle_map_ref = &handle_map; |
| 96 | // Perform repeated allocation/deallocation pairs |
| 97 | c.bench_function("single-threaded allocate/deallocate pairs (empty init state)", |b| { |
| 98 | b.iter(|| { |
| 99 | let handle = handle_map_ref.allocate(|| black_box(0xDD)).unwrap(); |
| 100 | handle_map_ref.deallocate(handle); |
| 101 | }) |
| 102 | }); |
| 103 | } |
| 104 | |
| 105 | /// Benchmark for repeated allocation->deallocation starting |
| 106 | /// from a map with a thousand elements per shard. |
| 107 | #[allow(unused_must_use)] |
| 108 | fn single_threaded_allocate_deallocate_one_thousand_benchmark(c: &mut Criterion) { |
| 109 | let handle_map = build_handle_map(8); |
| 110 | for _ in 0..(8 * 1000) { |
| 111 | handle_map.allocate(|| black_box(0xFF)).unwrap(); |
| 112 | } |
| 113 | let handle_map_ref = &handle_map; |
| 114 | |
| 115 | // Perform repeated allocation/deallocation pairs |
| 116 | c.bench_function( |
| 117 | "single-threaded allocate/deallocate pairs (init one thousand elements per shard)", |
| 118 | |b| { |
| 119 | b.iter(|| { |
| 120 | let handle = handle_map_ref.allocate(|| black_box(0xDD)).unwrap(); |
| 121 | handle_map_ref.deallocate(handle); |
| 122 | }) |
| 123 | }, |
| 124 | ); |
| 125 | } |
| 126 | |
| 127 | /// Benchmark for repeated allocation->deallocation starting |
| 128 | /// from a map with a million elements per shard. |
| 129 | #[allow(unused_must_use)] |
| 130 | fn single_threaded_allocate_deallocate_one_million_benchmark(c: &mut Criterion) { |
| 131 | let handle_map = build_handle_map(8); |
| 132 | for _ in 0..(8 * 1000000) { |
| 133 | handle_map.allocate(|| black_box(0xFF)).unwrap(); |
| 134 | } |
| 135 | let handle_map_ref = &handle_map; |
| 136 | |
| 137 | // Perform repeated allocation/deallocation pairs |
| 138 | c.bench_function( |
| 139 | "single-threaded allocate/deallocate pairs (init one million elements per shard)", |
| 140 | |b| { |
| 141 | b.iter(|| { |
| 142 | let handle = handle_map_ref.allocate(|| black_box(0xDD)).unwrap(); |
| 143 | handle_map_ref.deallocate(handle); |
| 144 | }) |
| 145 | }, |
| 146 | ); |
| 147 | } |
| 148 | |
| 149 | /// Helper function for creating a multi-threaded benchmark |
| 150 | /// with the set number of threads. Untimed Initialization of state |
| 151 | /// should occur prior to this call. This method will repeatedly |
| 152 | /// execute the benchmark code on all threads |
| 153 | fn bench_multithreaded<F>(name: &str, num_threads: u8, benchable_fn_ref: Arc<F>, c: &mut Criterion) |
| 154 | where |
| 155 | F: Fn() + Send + Sync + 'static, |
| 156 | { |
| 157 | c.bench_function(name, move |b| { |
| 158 | b.iter_custom(|iters| { |
| 159 | // The returned results from the threads will be their timings |
| 160 | let mut join_handles = Vec::new(); |
| 161 | for _ in 0..num_threads { |
| 162 | let benchable_fn_ref_clone = benchable_fn_ref.clone(); |
| 163 | let join_handle = thread::spawn(move || { |
| 164 | let start = Instant::now(); |
| 165 | for _ in 0..iters { |
| 166 | benchable_fn_ref_clone(); |
| 167 | } |
| 168 | start.elapsed() |
| 169 | }); |
| 170 | join_handles.push(join_handle); |
| 171 | } |
| 172 | //Threads spawned, now collect the results |
| 173 | let mut total_duration = Duration::from_nanos(0); |
| 174 | for join_handle in join_handles { |
| 175 | let thread_duration = join_handle.join().unwrap(); |
| 176 | total_duration += thread_duration; |
| 177 | } |
| 178 | total_duration /= num_threads as u32; |
| 179 | total_duration |
| 180 | }) |
| 181 | }); |
| 182 | } |
| 183 | |
/// Defines a number of reads/writes to perform [relative
/// to other operations performed in a given test]
#[derive(Debug, Clone, Copy)]
struct ReadWriteCount {
    // Number of read operations per object per benchmark iteration.
    num_reads: usize,
    // Number of write operations per object per benchmark iteration.
    num_writes: usize,
}

// Preset read/write mixes, each totaling four operations per object,
// spanning the spectrum from read-only to write-only.
const READS_ONLY: ReadWriteCount = ReadWriteCount { num_reads: 4, num_writes: 0 };

const READ_LEANING: ReadWriteCount = ReadWriteCount { num_reads: 3, num_writes: 1 };

const BALANCED: ReadWriteCount = ReadWriteCount { num_reads: 2, num_writes: 2 };

const WRITE_LEANING: ReadWriteCount = ReadWriteCount { num_reads: 1, num_writes: 3 };

const WRITES_ONLY: ReadWriteCount = ReadWriteCount { num_reads: 0, num_writes: 4 };

// All presets, in read-heaviest-first order, for tests that sweep the mixes.
const READ_WRITE_COUNTS: [ReadWriteCount; 5] =
    [READS_ONLY, READ_LEANING, BALANCED, WRITE_LEANING, WRITES_ONLY];
| 204 | |
| 205 | /// Benchmarks a repeated allocate/[X writes]/[Y reads]/deallocate workflow across |
| 206 | /// the default number of threads and shards. |
| 207 | fn lifecycle_read_and_write_multithreaded(per_object_rw_count: ReadWriteCount, c: &mut Criterion) { |
| 208 | let name = format!( |
| 209 | "lifecycle_read_and_write_multithreaded(reads/object: {}, writes/object:{})", |
| 210 | per_object_rw_count.num_reads, per_object_rw_count.num_writes |
| 211 | ); |
| 212 | // We need an Arc to be able to pass this off to `bench_multithreaded`. |
| 213 | let handle_map = Arc::new(build_handle_map(DEFAULT_SHARDS_AND_THREADS)); |
| 214 | bench_multithreaded( |
| 215 | &name, |
| 216 | DEFAULT_SHARDS_AND_THREADS, |
| 217 | Arc::new(move || { |
| 218 | let handle = handle_map.allocate(|| black_box(0xFF)).unwrap(); |
| 219 | for _ in 0..per_object_rw_count.num_writes { |
| 220 | let mut write_guard = handle_map.get_mut(handle).unwrap(); |
| 221 | *write_guard *= black_box(1); |
| 222 | } |
| 223 | for _ in 0..per_object_rw_count.num_reads { |
| 224 | let read_guard = handle_map.get_mut(handle).unwrap(); |
| 225 | black_box(*read_guard); |
| 226 | } |
| 227 | handle_map.deallocate(handle).unwrap(); |
| 228 | }), |
| 229 | c, |
| 230 | ); |
| 231 | } |
| 232 | |
| 233 | fn lifecycle_benchmarks(c: &mut Criterion) { |
| 234 | for read_write_count in READ_WRITE_COUNTS { |
| 235 | // Using 8 separate shards to roughly match the number of active threads. |
| 236 | lifecycle_read_and_write_multithreaded(read_write_count, c); |
| 237 | } |
| 238 | } |
| 239 | |
| 240 | /// Benchmarks repeated cycles of accessing the given number of distinct objects, each |
| 241 | /// with the given number of reads and writes across the given number of threads. |
| 242 | fn read_and_write_contention_multithreaded( |
| 243 | num_threads: u8, |
| 244 | num_distinct_objects: u32, |
| 245 | per_object_rw_count: ReadWriteCount, |
| 246 | c: &mut Criterion, |
| 247 | ) { |
| 248 | let name = format!("read_and_write_contention_multithreaded(threads: {}, objects: {}, reads/object: {}, writes/object:{})", |
| 249 | num_threads, num_distinct_objects, per_object_rw_count.num_reads, per_object_rw_count.num_writes); |
| 250 | // Default to 8 shards, ultimately doesn't matter much since we only ever access a few shards. |
| 251 | // This needs to be `'static` in order to pass a ref off to `bench_multithreaded`. |
| 252 | let handle_map = Arc::new(build_handle_map(8)); |
| 253 | let mut handles = Vec::new(); |
| 254 | for i in 0..num_distinct_objects { |
| 255 | let handle = handle_map.allocate(|| i as u8).unwrap(); |
| 256 | handles.push(handle); |
| 257 | } |
| 258 | |
| 259 | bench_multithreaded( |
| 260 | &name, |
| 261 | num_threads, |
| 262 | Arc::new(move || { |
| 263 | // Deliberately performing all writes first, with |
| 264 | // the goal of getting staggered timings from |
| 265 | // all of the threads for subsequent iterations. |
| 266 | // We also perform reads and writes across all objects |
| 267 | // in batches to get something resembling uniform access. |
| 268 | for _ in 0..per_object_rw_count.num_writes { |
| 269 | for handle in handles.iter() { |
| 270 | let mut write_guard = handle_map.get_mut(*handle).unwrap(); |
| 271 | *write_guard *= black_box(1); |
| 272 | } |
| 273 | } |
| 274 | for _ in 0..per_object_rw_count.num_reads { |
| 275 | for handle in handles.iter() { |
| 276 | let read_guard = handle_map.get(*handle).unwrap(); |
| 277 | black_box(*read_guard); |
| 278 | } |
| 279 | } |
| 280 | }), |
| 281 | c, |
| 282 | ); |
| 283 | } |
| 284 | |
| 285 | fn read_write_contention_benchmarks(c: &mut Criterion) { |
| 286 | for num_threads in [2, 4, 8].iter() { |
| 287 | for num_distinct_objects in [1u32, 2u32, 4u32].iter() { |
| 288 | if num_distinct_objects >= &(*num_threads as u32) { |
| 289 | continue; |
| 290 | } |
| 291 | for read_write_count in READ_WRITE_COUNTS { |
| 292 | read_and_write_contention_multithreaded( |
| 293 | *num_threads, |
| 294 | *num_distinct_objects, |
| 295 | read_write_count, |
| 296 | c, |
| 297 | ); |
| 298 | } |
| 299 | } |
| 300 | } |
| 301 | } |
| 302 | |
| 303 | /// Benchmarks allocation (starting from empty) for the default number of shards and threads |
| 304 | fn allocator_contention_benchmark(c: &mut Criterion) { |
| 305 | let name = "allocator_contention_multithreaded"; |
| 306 | // We need an Arc to pass this off to `bench_multithreaded` |
| 307 | let handle_map = Arc::new(build_handle_map(DEFAULT_SHARDS_AND_THREADS)); |
| 308 | bench_multithreaded( |
| 309 | name, |
| 310 | DEFAULT_SHARDS_AND_THREADS, |
| 311 | Arc::new(move || { |
| 312 | handle_map.allocate(|| black_box(0xFF)).unwrap(); |
| 313 | }), |
| 314 | c, |
| 315 | ); |
| 316 | } |
| 317 | |
// Register all benchmarks above with Criterion and generate the harness `main`.
criterion_group!(
    benches,
    single_threaded_read_benchmark,
    single_threaded_write_benchmark,
    single_threaded_allocate_benchmark,
    single_threaded_allocate_deallocate_near_empty_benchmark,
    single_threaded_allocate_deallocate_one_thousand_benchmark,
    single_threaded_allocate_deallocate_one_million_benchmark,
    lifecycle_benchmarks,
    read_write_contention_benchmarks,
    allocator_contention_benchmark,
);
criterion_main!(benches);