[C++/CPP] 19.06. multi-threading Example inner product 벡터 내적을 통한 멀티 쓰레딩 예제

Notice

Recent Posts

250x250

« 2025/04 »
일	월	화	수	목	금	토
		1	2	3	4	5
6	7	8	9	10	11	12
13	14	15	16	17	18	19
20	21	22	23	24	25	26
27	28	29	30

관리 메뉴

일상 코딩

[C++/CPP] 19.06. multi-threading Example inner product 벡터 내적을 통한 멀티 쓰레딩 예제 본문

C++/따배C++ 19강 모던 C++ 기능들

[C++/CPP] 19.06. multi-threading Example inner product 벡터 내적을 통한 멀티 쓰레딩 예제

polarcompass 2021. 12. 16. 07:27

728x90

#include<chrono>
#include<iostream>
#include<mutex>
#include<random>
#include<thread>
#include<utility>
#include<vector>
#include<atomic>
#include<future>
#include<numeric>
#include<algorithm>
#include<execution>

using namespace std;

mutex mtx;

// Adds v0[i] * v1[i] for every i in [i_start, i_end) directly onto `sum`.
// No lock or atomic is used here, so callers that share `sum` across threads
// get unsynchronized read-modify-write updates on it.
void dotProductNaive(const std::vector<int> &v0, const std::vector<int> &v1,
    const unsigned &&i_start, const unsigned &&i_end, unsigned long long &sum)
{
    unsigned idx = i_start;
    while (idx < i_end)
    {
        // The product is formed in int and then added to the 64-bit total.
        sum += v0[idx] * v1[idx];
        ++idx;
    }
}

// Adds v0[i] * v1[i] for every i in [i_start, i_end) onto `sum`, taking the
// file-level mutex `mtx` once per element so that each individual update of
// `sum` happens while the mutex is held.
void dotProductLock(const std::vector<int> &v0, const std::vector<int> &v1,
    const unsigned i_start, const unsigned i_end, unsigned long long &sum)
{
    unsigned idx = i_start;
    while (idx < i_end)
    {
        {
            // Lock is acquired and released again for every single element.
            std::scoped_lock guard(mtx); // C++17
            sum += v0[idx] * v1[idx];
        }
        ++idx;
    }
}

// Adds v0[i] * v1[i] for every i in [i_start, i_end) onto the atomic `sum`,
// one atomic read-modify-write per element.
void dotProductAtomic(const std::vector<int> &v0, const std::vector<int> &v1,
    const unsigned i_start, const unsigned i_end, std::atomic<unsigned long long> &sum)
{
    for (unsigned idx = i_start; idx < i_end; idx++)
    {
        // fetch_add with the default memory order is what operator+= performs.
        const int product = v0[idx] * v1[idx];
        sum.fetch_add(static_cast<unsigned long long>(product));
    }
}

// Returns the dot product of v0 and v1 restricted to [i_start, i_end).
// The total is kept in a function-local int and returned by value, so the
// function touches no shared mutable state.
// NOTE(review): the accumulator (and therefore the deduced return type) is
// int; confirm that the callers' ranges and values keep the total within int.
auto dotProductFuture(const std::vector<int> &v0, const std::vector<int> &v1,
    const unsigned i_start, const unsigned i_end)
{
    int partial = 0; // per-call partial sum
    unsigned idx = i_start;
    while (idx < i_end)
    {
        partial = partial + v0[idx] * v1[idx];
        ++idx;
    }
    return partial;
}

// Benchmarks several ways of computing the dot product of two large vectors
// filled with random integers in [1, 10]:
//   1. std::inner_product (single-threaded reference result)
//   2. threads adding onto one shared sum without synchronization
//   3. threads adding onto one shared sum under a per-element mutex lock
//   4. threads adding onto one shared std::atomic sum
//   5. std::async tasks that each return a local partial sum
//   6. std::transform_reduce with the parallel execution policy
// Every section prints its label, the elapsed time in seconds and the sum.
int main()
{
    const long long n_data = 100000000;
    const unsigned n_threads = 4;

    // Work split shared by all threaded sections: worker t handles the
    // half-open range [chunk_begin(t), chunk_end(t)). The last worker also
    // takes the remainder, so n_data need not be a multiple of n_threads.
    const unsigned n_per_thread = static_cast<unsigned>(n_data / n_threads);
    const auto chunk_begin = [=](const unsigned t) { return t * n_per_thread; };
    const auto chunk_end = [=](const unsigned t) {
        return (t + 1 == n_threads) ? static_cast<unsigned>(n_data)
                                    : (t + 1) * n_per_thread;
    };

    // Initialize vectors (reserve up front so push_back never reallocates).
    std::vector<int> v0, v1;
    v0.reserve(n_data);
    v1.reserve(n_data);

    // Random number generation.
    random_device seed;
    mt19937 engine(seed());

    uniform_int_distribution<> uniformDist(1, 10);

    for (long long i = 0; i < n_data; ++i)
    {
        v0.push_back(uniformDist(engine));
        v1.push_back(uniformDist(engine));
    }

    std::cout << "std::inner_product" << endl;
    {
        const auto sta = chrono::steady_clock::now();
        const auto sum = std::inner_product(v0.begin(), v0.end(), v1.begin(), 0ull);
        const chrono::duration<double> dur = chrono::steady_clock::now() - sta;

        cout << dur.count() << endl;
        cout << sum << endl;
        cout << endl;
    }

    // The result of this section does not match the reference: all workers
    // update the same `sum` without any synchronization. It is kept on
    // purpose as the "what goes wrong" example.
    cout << "Naive" << endl;
    {
        unsigned long long sum = 0;
        vector<thread> threads;
        threads.reserve(n_threads);

        const auto sta = chrono::steady_clock::now();

        // Launch the workers.
        for (unsigned t = 0; t < n_threads; ++t)
        {
            threads.emplace_back(dotProductNaive, std::ref(v0), std::ref(v1),
                chunk_begin(t), chunk_end(t), std::ref(sum));
        }

        // Wait for all workers.
        for (auto &worker : threads)
        {
            worker.join();
        }
        const chrono::duration<double> dur = chrono::steady_clock::now() - sta;
        cout << dur.count() << endl;
        cout << sum << endl;
        cout << endl;
    }

    // Up to and including this section: examples of threading done badly
    // (this one locks the mutex once per element).
    cout << "Lock guard" << endl;
    {
        unsigned long long sum = 0;
        vector<thread> threads;
        threads.reserve(n_threads);

        const auto sta = chrono::steady_clock::now();

        // Launch the workers.
        for (unsigned t = 0; t < n_threads; ++t)
        {
            threads.emplace_back(dotProductLock, std::ref(v0), std::ref(v1),
                chunk_begin(t), chunk_end(t), std::ref(sum));
        }

        // Wait for all workers.
        for (auto &worker : threads)
        {
            worker.join();
        }
        const chrono::duration<double> dur = chrono::steady_clock::now() - sta;
        cout << dur.count() << endl;
        cout << sum << endl;
        cout << endl;
    }

    // Making the very frequently updated `sum` an atomic gives a correct
    // result but is slow, because every element performs an atomic update.
    cout << "Atomic" << endl;
    {
        atomic<unsigned long long> sum{0};
        vector<thread> threads;
        threads.reserve(n_threads);

        const auto sta = chrono::steady_clock::now();

        // Launch the workers.
        for (unsigned t = 0; t < n_threads; ++t)
        {
            threads.emplace_back(dotProductAtomic, std::ref(v0), std::ref(v1),
                chunk_begin(t), chunk_end(t), std::ref(sum));
        }

        // Wait for all workers.
        for (auto &worker : threads)
        {
            worker.join();
        }
        const chrono::duration<double> dur = chrono::steady_clock::now() - sta;
        cout << dur.count() << endl;
        cout << sum.load() << endl;
        cout << endl;
    }

    cout << "Task base, future, promise" << endl;
    {
        unsigned long long sum = 0;
        vector<std::future<int>> futures;
        futures.reserve(n_threads);

        const auto sta = chrono::steady_clock::now();

        // Each task accumulates into its own local sum, so the tasks run
        // independently and the partial results are combined with get().
        // std::launch::async is requested explicitly: with the default
        // policy the implementation may defer a task until get() is called,
        // which would execute the tasks one after another.
        for (unsigned t = 0; t < n_threads; ++t)
        {
            futures.push_back(std::async(std::launch::async, dotProductFuture,
                std::ref(v0), std::ref(v1), chunk_begin(t), chunk_end(t)));
        }

        // Collect every partial result via get() (this also waits for the task).
        for (auto &partial : futures)
        {
            sum += partial.get();
        }
        const chrono::duration<double> dur = chrono::steady_clock::now() - sta;
        cout << dur.count() << endl;
        cout << sum << endl;
        cout << endl;
    }

    cout << "std::transform_reduce" << endl;
    {
        const auto sta = chrono::steady_clock::now();

        const auto sum = std::transform_reduce(std::execution::par, v0.begin(), v0.end(), v1.begin(), 0ull);
        const chrono::duration<double> dur = chrono::steady_clock::now() - sta;

        cout << dur.count() << endl;
        cout << sum << endl;
        cout << endl;
    }

    return 0;
}

728x90

저작자표시 비영리 변경금지

'C++ > 따배C++ 19강 모던 C++ 기능들' 카테고리의 다른 글

[C++/CPP] 19.08. 자료형 추론 type inference, auto decltype (0)	2022.01.02
[C++/CPP] 19.07. Perfect forwarding std::forward (0)	2021.12.30
[C++/CPP] 19.05 Task base, async, future, promise 사용법 (0)	2021.12.15
[C++/CPP] 19.04 Race Condition and std::atomic, std::scoped_lock (0)	2021.12.12
[C++/CPP] 19.03 std::thread와 멀티 쓰레딩 기초 (0)	2021.12.12

'C++/따배C++ 19강 모던 C++ 기능들' Related Articles

일상 코딩

일상 코딩

[C++/CPP] 19.06. multi-threading Example inner product 벡터 내적을 통한 멀티 쓰레딩 예제 본문

[C++/CPP] 19.06. multi-threading Example inner product 벡터 내적을 통한 멀티 쓰레딩 예제

'C++ > 따배C++ 19강 모던 C++ 기능들' 카테고리의 다른 글

티스토리툴바

티스토리툴바