Barretenberg
The ZK-SNARK library at the core of Aztec
thread.cpp
#include "thread.hpp"
#include "log.hpp"
#include "throw_or_abort.hpp"

#include <cstdlib>
#include <string>

#ifndef NO_MULTITHREADING
#include <thread>

namespace {
uint32_t& get_num_cores_ref()
{
    static thread_local const char* val = std::getenv("HARDWARE_CONCURRENCY");
    static thread_local uint32_t cores =
        val != nullptr ? static_cast<uint32_t>(std::stoul(val)) : env_hardware_concurrency();
    return cores;
}
} // namespace
#endif

namespace bb {
// only for testing purposes currently
void set_parallel_for_concurrency([[maybe_unused]] size_t num_cores)
{
#ifdef NO_MULTITHREADING
    throw_or_abort("Cannot set hardware concurrency when multithreading is disabled.");
#else
    get_num_cores_ref() = static_cast<uint32_t>(num_cores);
#endif
}

size_t get_num_cpus()
{
#ifdef NO_MULTITHREADING
    return 1;
#else
    return static_cast<size_t>(get_num_cores_ref());
#endif
}
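// Usage note (sketch): the cached core count above is thread_local, so HARDWARE_CONCURRENCY is read
// once per thread and set_parallel_for_concurrency() only overrides the value for the calling thread:
//
//   bb::set_parallel_for_concurrency(4);
//   // bb::get_num_cpus() now returns 4 on this thread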
} // namespace bb

namespace bb {
// 64 core aws r5.
// pippenger run: pippenger_bench/1048576
// coset_fft run: coset_fft_bench_parallel/4194304
// proof run: 2m gate ultraplonk. average of 5.

// pippenger: 179ms
// coset_fft: 54776us
// proof: 11.33s
void parallel_for_omp(size_t num_iterations, const std::function<void(size_t)>& func);

// pippenger: 163ms
// coset_fft: 59993us
// proof: 11.11s
void parallel_for_moody(size_t num_iterations, const std::function<void(size_t)>& func);

// pippenger: 154ms
// coset_fft: 92997us
// proof: 10.84s
void parallel_for_spawning(size_t num_iterations, const std::function<void(size_t)>& func);

// pippenger: 178ms
// coset_fft: 70207us
// proof: 11.55s
void parallel_for_queued(size_t num_iterations, const std::function<void(size_t)>& func);

// pippenger: 152ms
// coset_fft: 56658us
// proof: 11.28s
void parallel_for_atomic_pool(size_t num_iterations, const std::function<void(size_t)>& func);

void parallel_for_mutex_pool(size_t num_iterations, const std::function<void(size_t)>& func);

void parallel_for(size_t num_iterations, const std::function<void(size_t)>& func)
{
#ifdef NO_MULTITHREADING
    for (size_t i = 0; i < num_iterations; ++i) {
        func(i);
    }
#else
#ifdef OMP_MULTITHREADING
    parallel_for_omp(num_iterations, func);
#else
    // parallel_for_spawning(num_iterations, func);
    // parallel_for_moody(num_iterations, func);
    // parallel_for_atomic_pool(num_iterations, func);
    parallel_for_mutex_pool(num_iterations, func);
    // parallel_for_queued(num_iterations, func);
#endif
#endif
}
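// Usage sketch (illustrative): parallel_for hands each index in [0, num_iterations) to the
// selected backend, e.g. to square every element of a vector:
//
//   std::vector<uint64_t> values(1 << 20, 3);
//   parallel_for(values.size(), [&](size_t i) { values[i] *= values[i]; });
//
// The callback must be safe to invoke concurrently for distinct indices.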
void parallel_for_range(size_t num_points,
                        const std::function<void(size_t, size_t)>& func,
                        size_t no_multhreading_if_less_or_equal)
{
    if (num_points <= no_multhreading_if_less_or_equal) {
        func(0, num_points);
        return;
    }
    // Get number of cpus we can split into
    const size_t num_cpus = get_num_cpus();

    // Compute the size of a single chunk
    const size_t chunk_size = (num_points / num_cpus) + (num_points % num_cpus == 0 ? 0 : 1);
    // Parallelize over chunks
    parallel_for(num_cpus, [num_points, chunk_size, &func](size_t chunk_index) {
        // If num_points is small, sometimes we need fewer CPUs
        if (chunk_size * chunk_index > num_points) {
            return;
        }
        // Compute the current chunk size (can differ in case it's the last chunk)
        size_t current_chunk_size = std::min(num_points - (chunk_size * chunk_index), chunk_size);
        if (current_chunk_size == 0) {
            return;
        }
        size_t start = chunk_index * chunk_size;
        size_t end = chunk_index * chunk_size + current_chunk_size;
        func(start, end);
    });
};
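// Usage sketch (illustrative, with a hypothetical per-point accumulate()): the callback receives
// contiguous [start, end) sub-ranges, which avoids one std::function call per point:
//
//   parallel_for_range(
//       points.size(),
//       [&](size_t start, size_t end) {
//           for (size_t i = start; i < end; ++i) {
//               accumulate(points[i]);
//           }
//       },
//       /*no_multhreading_if_less_or_equal=*/16);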
void parallel_for_heuristic(size_t num_points,
                            const std::function<void(size_t, size_t, size_t)>& func,
                            size_t heuristic_cost)
{
    // We take the maximum observed parallel_for overhead (388 us ≈ 388,000 ns) and round it up.
    // The goal of this check is to avoid significantly (10x) increasing processing time for small workloads, so we
    // accept not triggering parallel_for even when it would have made a medium workload up to half a millisecond
    // faster.
    constexpr size_t PARALLEL_FOR_COST = 400000;
    // Get number of cpus we can split into
    const size_t num_cpus = get_num_cpus();

    // Compute the size of a single chunk
    const size_t chunk_size = (num_points / num_cpus) + (num_points % num_cpus == 0 ? 0 : 1);

    // Compute the cost of all operations done by other threads
    const size_t offset_cost = (num_points - chunk_size) * heuristic_cost;

    // If starting parallel_for would take longer than the work we'd offload, just compute serially
    if (offset_cost < PARALLEL_FOR_COST) {
        func(0, num_points, 0);
        return;
    }
    // Parallelize over chunks
    parallel_for(num_cpus, [num_points, chunk_size, &func](size_t chunk_index) {
        // If num_points is small, sometimes we need fewer CPUs
        if (chunk_size * chunk_index > num_points) {
            return;
        }
        // Compute the current chunk size (can differ in case it's the last chunk)
        size_t current_chunk_size = std::min(num_points - (chunk_size * chunk_index), chunk_size);
        if (current_chunk_size == 0) {
            return;
        }
        size_t start = chunk_index * chunk_size;
        size_t end = chunk_index * chunk_size + current_chunk_size;

        func(start, end, chunk_index);
    });
};
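// Usage sketch (illustrative, with hypothetical process() and COST_PER_POINT): the caller passes an
// estimated cost per point in the same nanosecond-scale units as PARALLEL_FOR_COST, so small
// workloads stay on the calling thread:
//
//   constexpr size_t COST_PER_POINT = 50; // assumed per-iteration cost estimate
//   parallel_for_heuristic(
//       coefficients.size(),
//       [&](size_t start, size_t end, size_t chunk_index) {
//           for (size_t i = start; i < end; ++i) {
//               process(coefficients[i], chunk_index);
//           }
//       },
//       COST_PER_POINT);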
MultithreadData calculate_thread_data(size_t num_iterations, size_t min_iterations_per_thread)
{
    size_t num_threads = calculate_num_threads(num_iterations, min_iterations_per_thread);
    const size_t thread_size = num_iterations / num_threads;

    // Compute the index bounds for each thread
    std::vector<size_t> start(num_threads);
    std::vector<size_t> end(num_threads);
    for (size_t thread_idx = 0; thread_idx < num_threads; ++thread_idx) {
        start[thread_idx] = thread_idx * thread_size;
        end[thread_idx] = (thread_idx == num_threads - 1) ? num_iterations : (thread_idx + 1) * thread_size;
    }

    return MultithreadData{ num_threads, start, end };
}
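// Usage sketch (illustrative, assuming the struct exposes num_threads/start/end as constructed above):
// the returned bounds are typically consumed inside a parallel_for over thread indices:
//
//   MultithreadData data = calculate_thread_data(num_iterations, /*min_iterations_per_thread=*/64);
//   parallel_for(data.num_threads, [&](size_t thread_idx) {
//       for (size_t i = data.start[thread_idx]; i < data.end[thread_idx]; ++i) {
//           // per-iteration work (hypothetical)
//       }
//   });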
size_t calculate_num_threads(size_t num_iterations, size_t min_iterations_per_thread)
{
    size_t max_num_threads = get_num_cpus(); // number of available threads
    size_t desired_num_threads = num_iterations / min_iterations_per_thread;
    size_t num_threads = std::min(desired_num_threads, max_num_threads); // fewer than max if justified
    num_threads = num_threads > 0 ? num_threads : 1; // ensure num_threads is at least 1
    return num_threads;
}
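// Worked example (illustrative): with num_iterations = 1000, min_iterations_per_thread = 100 and
// 16 available CPUs, desired_num_threads = 10, so 10 threads are used; with num_iterations = 50
// the desired count of 0 is clamped up to 1.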
size_t calculate_num_threads_pow2(size_t num_iterations, size_t min_iterations_per_thread)
{
    size_t max_num_threads = get_num_cpus_pow2(); // number of available threads (power of 2)
    size_t desired_num_threads = num_iterations / min_iterations_per_thread;
    desired_num_threads = static_cast<size_t>(1ULL << numeric::get_msb(desired_num_threads));
    size_t num_threads = std::min(desired_num_threads, max_num_threads); // fewer than max if justified
    num_threads = num_threads > 0 ? num_threads : 1; // ensure num_threads is at least 1
    return num_threads;
}
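// Worked example (illustrative): with num_iterations = 1000 and min_iterations_per_thread = 100,
// desired_num_threads = 10 and is rounded down to 8 (the largest power of two not exceeding it),
// then capped at get_num_cpus_pow2().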
} // namespace bb