Notice
Recent Posts
250x250
«   2025/01   »
1 2 3 4
5 6 7 8 9 10 11
12 13 14 15 16 17 18
19 20 21 22 23 24 25
26 27 28 29 30 31
관리 메뉴

일상 코딩

[C++/CPP] 19.06. multi-threading Example inner product 벡터 내적을 통한 멀티 쓰레딩 예제 본문

C++/따배C++ 19강 모던 C++ 기능들

[C++/CPP] 19.06. multi-threading Example inner product 벡터 내적을 통한 멀티 쓰레딩 예제

polarcompass 2021. 12. 16. 07:27
728x90
#include<chrono>
#include<iostream>
#include<mutex>
#include<random>
#include<thread>
#include<utility>
#include<vector>
#include<atomic>
#include<future>
#include<numeric>
#include<algorithm>
#include<execution>

using namespace std;

// File-level mutex; dotProductLock acquires it around each update of the shared sum.
mutex mtx;

/**
 * Adds the dot product of v0 and v1 over the index range [i_start, i_end)
 * to `sum`.
 *
 * This function performs no synchronization: `sum` is read and written on
 * every iteration, so concurrent calls that share the same `sum` race with
 * each other.
 *
 * The bounds are taken by value (rather than by rvalue reference) so that
 * both lvalues and temporaries can be passed, matching the other
 * dotProduct* functions in this file.
 *
 * @param v0      first input vector; must hold at least i_end elements
 * @param v1      second input vector; must hold at least i_end elements
 * @param i_start first index to process (inclusive)
 * @param i_end   one past the last index to process
 * @param sum     accumulator updated in place
 */
void dotProductNaive(const vector<int> &v0, const vector<int> &v1,
    const unsigned i_start, const unsigned i_end, unsigned long long &sum)
{
    for (unsigned i = i_start; i < i_end; ++i)
    {
        // Widen before multiplying so the product cannot overflow int.
        sum += static_cast<long long>(v0[i]) * v1[i];
    }
}

/**
 * Adds the dot product of v0 and v1 over the index range [i_start, i_end)
 * to `sum`, holding the file-level mutex `mtx` around every single update.
 *
 * The lock is deliberately taken per element rather than once per call, so
 * every update of `sum` is serialized against other callers using `mtx`.
 *
 * @param v0      first input vector; must hold at least i_end elements
 * @param v1      second input vector; must hold at least i_end elements
 * @param i_start first index to process (inclusive)
 * @param i_end   one past the last index to process
 * @param sum     shared accumulator updated in place under `mtx`
 */
void dotProductLock(const vector<int> &v0, const vector<int> &v1,
    const unsigned i_start, const unsigned i_end, unsigned long long &sum)
{
    for (unsigned i = i_start; i < i_end; ++i)
    {
        // Widen before multiplying so the product cannot overflow int;
        // computed outside the critical section since it touches no shared state.
        const long long product = static_cast<long long>(v0[i]) * v1[i];

        std::scoped_lock lock(mtx); // C++17; released at the end of each iteration
        sum += product;
    }
}

/**
 * Adds the dot product of v0 and v1 over the index range [i_start, i_end)
 * to the atomic accumulator `sum`, issuing one atomic add per element.
 *
 * @param v0      first input vector; must hold at least i_end elements
 * @param v1      second input vector; must hold at least i_end elements
 * @param i_start first index to process (inclusive)
 * @param i_end   one past the last index to process
 * @param sum     atomic accumulator updated in place
 */
void dotProductAtomic(const vector<int> &v0, const vector<int> &v1,
    const unsigned i_start, const unsigned i_end, atomic<unsigned long long> &sum)
{
    for (unsigned i = i_start; i < i_end; ++i)
    {
        // Widen before multiplying so the product cannot overflow int.
        const long long product = static_cast<long long>(v0[i]) * v1[i];
        sum += static_cast<unsigned long long>(product);
    }
}

/**
 * Computes the dot product of v0 and v1 over the index range
 * [i_start, i_end) into a function-local accumulator and returns it, so no
 * state is shared between concurrent calls.
 *
 * NOTE: the accumulator (and therefore the deduced return type) is int, so
 * the partial sum of the requested range must fit in an int.
 *
 * @param v0      first input vector; must hold at least i_end elements
 * @param v1      second input vector; must hold at least i_end elements
 * @param i_start first index to process (inclusive)
 * @param i_end   one past the last index to process
 * @return the partial dot product as an int
 */
auto dotProductFuture(const vector<int> &v0, const vector<int> &v1,
    const unsigned i_start, const unsigned i_end)
{
    int partial = 0;
    unsigned idx = i_start;
    while (idx < i_end)
    {
        partial += v0[idx] * v1[idx];
        ++idx;
    }
    return partial;
}

/**
 * Benchmark driver: fills two vectors with random integers in [1, 10] and
 * computes their dot product six ways, printing for each the elapsed time in
 * seconds, the resulting sum, and a blank line:
 *   1. std::inner_product (single-threaded reference)
 *   2. threads sharing a plain accumulator (dotProductNaive)
 *   3. threads sharing an accumulator guarded by a mutex (dotProductLock)
 *   4. threads sharing an atomic accumulator (dotProductAtomic)
 *   5. std::async tasks returning partial sums (dotProductFuture)
 *   6. std::transform_reduce with the parallel execution policy
 *
 * @return 0 on completion
 */
int main()
{
    const long long n_data = 100000000;
    const unsigned n_threads = 4;

    // Build two vectors of n_data random integers drawn uniformly from [1, 10].
    std::vector<int> v0, v1;
    v0.reserve(n_data);
    v1.reserve(n_data);

    random_device seed;
    mt19937 engine(seed());
    uniform_int_distribution<> uniformDist(1, 10);

    for (long long i = 0; i < n_data; ++i)
    {
        v0.push_back(uniformDist(engine));
        v1.push_back(uniformDist(engine));
    }

    // Worker t handles the index range [chunkBegin(t), chunkEnd(t)). The last
    // worker's range is extended to n_data so that no elements are dropped
    // when n_data is not a multiple of n_threads.
    const unsigned n_per_thread = static_cast<unsigned>(n_data / n_threads);
    const auto chunkBegin = [=](const unsigned t) { return t * n_per_thread; };
    const auto chunkEnd = [=](const unsigned t) {
        return (t + 1 == n_threads) ? static_cast<unsigned>(n_data)
                                    : (t + 1) * n_per_thread;
    };

    // Prints elapsed seconds, the computed sum, and a blank separator line.
    const auto report = [](const chrono::duration<double> elapsed, const auto &total) {
        cout << elapsed.count() << '\n' << total << '\n' << endl;
    };

    std::cout << "std::inner_product" << endl;
    {
        const auto sta = chrono::steady_clock::now();
        const auto sum = std::inner_product(v0.begin(), v0.end(), v1.begin(), 0ull);
        report(chrono::steady_clock::now() - sta, sum);
    }

    // The printed answer does not match the reference: every worker updates
    // the same plain `sum` with no synchronization. Kept as a counter-example.
    cout << "Naive" << endl;
    {
        unsigned long long sum = 0;
        vector<thread> threads;
        threads.reserve(n_threads);

        const auto sta = chrono::steady_clock::now();

        // Launch the workers.
        for (unsigned t = 0; t < n_threads; ++t)
        {
            threads.emplace_back(dotProductNaive, std::ref(v0), std::ref(v1),
                chunkBegin(t), chunkEnd(t), std::ref(sum));
        }

        // Wait for all workers.
        for (auto &worker : threads)
        {
            worker.join();
        }
        report(chrono::steady_clock::now() - sta, sum);
    }

    // Author's note: the approaches up to and including this one are the
    // "failed" threading examples.
    cout << "Lock guard" << endl;
    {
        unsigned long long sum = 0;
        vector<thread> threads;
        threads.reserve(n_threads);

        const auto sta = chrono::steady_clock::now();

        // Launch the workers.
        for (unsigned t = 0; t < n_threads; ++t)
        {
            threads.emplace_back(dotProductLock, std::ref(v0), std::ref(v1),
                chunkBegin(t), chunkEnd(t), std::ref(sum));
        }

        // Wait for all workers.
        for (auto &worker : threads)
        {
            worker.join();
        }
        report(chrono::steady_clock::now() - sta, sum);
    }

    // Author's note: making the frequently updated `sum` atomic slows the
    // computation down.
    cout << "Atomic" << endl;
    {
        atomic<unsigned long long> sum{0};
        vector<thread> threads;
        threads.reserve(n_threads);

        const auto sta = chrono::steady_clock::now();

        // Launch the workers.
        for (unsigned t = 0; t < n_threads; ++t)
        {
            threads.emplace_back(dotProductAtomic, std::ref(v0), std::ref(v1),
                chunkBegin(t), chunkEnd(t), std::ref(sum));
        }

        // Wait for all workers.
        for (auto &worker : threads)
        {
            worker.join();
        }
        report(chrono::steady_clock::now() - sta, sum.load());
    }

    cout << "Task base, future, promise" << endl;
    {
        unsigned long long sum = 0;
        vector<std::future<int>> futures;
        futures.reserve(n_threads);

        const auto sta = chrono::steady_clock::now();

        // The worker function accumulates into a local variable, so each task
        // computes its partial result independently; the partial results are
        // combined afterwards through get().
        for (unsigned t = 0; t < n_threads; ++t)
        {
            futures.push_back(std::async(dotProductFuture, std::ref(v0), std::ref(v1),
                chunkBegin(t), chunkEnd(t)));
        }

        // Collect every partial result; get() waits for its task to finish.
        for (auto &partial : futures)
        {
            sum += partial.get();
        }
        report(chrono::steady_clock::now() - sta, sum);
    }

    cout << "std::transform_reduce" << endl;
    {
        const auto sta = chrono::steady_clock::now();
        const auto sum = std::transform_reduce(std::execution::par, v0.begin(), v0.end(), v1.begin(), 0ull);
        report(chrono::steady_clock::now() - sta, sum);
    }

    return 0;
}
728x90