Barretenberg
The ZK-SNARK library at the core of Aztec
Loading...
Searching...
No Matches
thread.hpp
Go to the documentation of this file.
1#pragma once
3#include <atomic>
6#include <functional>
7#include <iostream>
8#include <ranges>
9#include <vector>
10
11namespace bb {
12#ifdef __wasm__
13// Fixed number of workers in WASM environment
14constexpr size_t PARALLEL_FOR_MAX_NESTING = 1;
15#else
16constexpr size_t PARALLEL_FOR_MAX_NESTING = 2;
17#endif
18
19// Useful for programatically benching different thread counts
20// Note this is threadsafe and affects parallel_for's just in that thread if so.
21void set_parallel_for_concurrency(size_t num_cores);
22size_t get_num_cpus();
23
24// For algorithms that need to be divided amongst power of 2 threads.
25inline size_t get_num_cpus_pow2()
26{
27 return static_cast<size_t>(1ULL << numeric::get_msb(get_num_cpus()));
28}
29
37void parallel_for(size_t num_iterations, const std::function<void(size_t)>& func);
38void parallel_for_range(size_t num_points,
39 const std::function<void(size_t, size_t)>& func,
40 size_t no_multhreading_if_less_or_equal = 0);
41
52void parallel_for_heuristic(size_t num_points,
53 const std::function<void(size_t, size_t, size_t)>& func,
54 size_t heuristic_cost);
55
56template <typename Func>
58void parallel_for_heuristic(size_t num_points, const Func& func, size_t heuristic_cost)
59{
61 num_points,
62 [&](size_t start_idx, size_t end_idx, BB_UNUSED size_t chunk_index) {
63 for (size_t i = start_idx; i < end_idx; i++) {
64 func(i);
65 }
66 },
67 heuristic_cost);
68}
69
76template <typename Func, typename Accum>
79 const Accum& initial_accum,
80 const Func& func,
81 size_t heuristic_cost)
82{
83 // thread-safe accumulators
84 std::vector<Accum> accumulators(get_num_cpus(), initial_accum);
86 num_points,
87 [&](size_t start_idx, size_t end_idx, size_t chunk_index) {
88 for (size_t i = start_idx; i < end_idx; i++) {
89 func(i, accumulators[chunk_index]);
90 }
91 },
92 heuristic_cost);
93 return accumulators;
94}
95
96const size_t DEFAULT_MIN_ITERS_PER_THREAD = 1 << 4;
97
100 // index bounds for each thread
101 std::vector<size_t> start;
102 std::vector<size_t> end;
103};
104
113MultithreadData calculate_thread_data(size_t num_iterations,
114 size_t min_iterations_per_thread = DEFAULT_MIN_ITERS_PER_THREAD);
115
126size_t calculate_num_threads(size_t num_iterations, size_t min_iterations_per_thread = DEFAULT_MIN_ITERS_PER_THREAD);
127
135size_t calculate_num_threads_pow2(size_t num_iterations,
136 size_t min_iterations_per_thread = DEFAULT_MIN_ITERS_PER_THREAD);
137
138namespace thread_heuristics {
139// Rough cost of operations (the operation costs are derives in basics_bench and the units are nanoseconds)
140// Field element (16 byte) addition cost
141constexpr size_t FF_ADDITION_COST = 4;
142// Field element (16 byte) multiplication cost
143constexpr size_t FF_MULTIPLICATION_COST = 21;
144// Field element (16 byte) inversion cost
145constexpr size_t FF_INVERSION_COST = 7000;
146// Group element projective addition number
147constexpr size_t GE_ADDITION_COST = 350;
148// Group element projective doubling number
149constexpr size_t GE_DOUBLING_COST = 194;
150// Group element scalar multiplication cost
151constexpr size_t SM_COST = 50000;
152// Field element (16 byte) sequential copy number
153constexpr size_t FF_COPY_COST = 3;
154// Fine default if something looks 'chunky enough that I don't want to calculate'
155constexpr size_t ALWAYS_MULTITHREAD = 100000;
156} // namespace thread_heuristics
157
161 auto range(size_t size, size_t offset = 0) const
162 {
164 return std::views::iota(size_t{ 0 }, size_t{ 0 });
165 }
166 // Calculate base chunk size and remainder
167 size_t chunk_size = size / total_threads;
168 size_t remainder = size % total_threads;
169
170 if (thread_index < remainder) {
171 // Threads with index < remainder get chunk_size + 1 elements
172 size_t start = thread_index * (chunk_size + 1);
173 size_t end = start + chunk_size + 1;
174 return std::views::iota(start + offset, end + offset);
175 }
176 // Threads with index >= remainder get chunk_size elements
177 size_t start = remainder * (chunk_size + 1) + (thread_index - remainder) * chunk_size;
178 size_t end = start + chunk_size;
179 return std::views::iota(start + offset, end + offset);
180 }
181};
182
183template <typename Func>
185void parallel_for(const Func& func)
186{
187 size_t total_threads = get_num_cpus();
188 parallel_for(total_threads, [&](size_t thread_index) {
189 func(ThreadChunk{ .thread_index = thread_index, .total_threads = total_threads });
190 });
191}
192
193} // namespace bb
#define BB_UNUSED
ssize_t offset
Definition engine.cpp:36
constexpr T get_msb(const T in)
Definition get_msb.hpp:47
constexpr size_t FF_COPY_COST
Definition thread.hpp:153
constexpr size_t GE_ADDITION_COST
Definition thread.hpp:147
constexpr size_t GE_DOUBLING_COST
Definition thread.hpp:149
constexpr size_t ALWAYS_MULTITHREAD
Definition thread.hpp:155
constexpr size_t FF_ADDITION_COST
Definition thread.hpp:141
constexpr size_t FF_MULTIPLICATION_COST
Definition thread.hpp:143
constexpr size_t FF_INVERSION_COST
Definition thread.hpp:145
constexpr size_t SM_COST
Definition thread.hpp:151
Entry point for Barretenberg command-line interface.
MultithreadData calculate_thread_data(size_t num_iterations, size_t min_iterations_per_thread)
Calculates number of threads and index bounds for each thread.
Definition thread.cpp:212
const size_t DEFAULT_MIN_ITERS_PER_THREAD
Definition thread.hpp:96
size_t get_num_cpus_pow2()
Definition thread.hpp:25
size_t get_num_cpus()
Definition thread.cpp:33
constexpr size_t PARALLEL_FOR_MAX_NESTING
Definition thread.hpp:16
size_t calculate_num_threads(size_t num_iterations, size_t min_iterations_per_thread)
calculates number of threads to create based on minimum iterations per thread
Definition thread.cpp:238
size_t calculate_num_threads_pow2(size_t num_iterations, size_t min_iterations_per_thread)
calculates number of threads to create based on minimum iterations per thread, guaranteed power of 2
Definition thread.cpp:254
void parallel_for_heuristic(size_t num_points, const std::function< void(size_t, size_t, size_t)> &func, size_t heuristic_cost)
Split a loop into several loops running in parallel based on operations in 1 iteration.
Definition thread.cpp:171
void set_parallel_for_concurrency(size_t num_cores)
Definition thread.cpp:24
void parallel_for(size_t num_iterations, const std::function< void(size_t)> &func)
Definition thread.cpp:111
void parallel_for_range(size_t num_points, const std::function< void(size_t, size_t)> &func, size_t no_multhreading_if_less_or_equal)
Split a loop into several loops running in parallel.
Definition thread.cpp:141
constexpr decltype(auto) get(::tuplet::tuple< T... > &&t) noexcept
Definition tuple.hpp:13
std::vector< size_t > end
Definition thread.hpp:102
std::vector< size_t > start
Definition thread.hpp:101
size_t total_threads
Definition thread.hpp:160
size_t thread_index
Definition thread.hpp:159
auto range(size_t size, size_t offset=0) const
Definition thread.hpp:161