Generate benchmarks for three implementations of a wait-free queue using C++11 atomics

October 23, 2014 ยท View on GitHub

// // Generate benchmarks for the three queue implementations shown in // Jeff Preshing's CppCon 2014 talk, "How Ubisoft Montreal Develops Games for // Multicore - Before and After C++11". // // Slides: https://github.com/CppCon/CppCon2014/blob/master/Presentations/How%20Ubisoft%20Montreal%20Develops%20Games%20for%20Multicore/How%20Ubisoft%20Montreal%20Develops%20Games%20for%20Multicore%20-%20Before%20and%20After%20C++11%20-%20Jeff%20Preshing%20-%20CppCon%202014.pdf?raw=true //

#include #include #include #include

// 0 : Low-level (relaxed) C++11 atomics with standalone memory fences // 1 : Low-level C++11 atomics with ordering constraints // 2 : Sequentially consistent C++11 atomics #define METHOD 0

using namespace std;

template <class T, int size> class CappedSPSCQueue { private: T m_items[size]; atomic m_writePos; alignas(64) int m_readPos;

public: CappedSPSCQueue() : m_writePos(0), m_readPos(0) {}

void clear()
{
    m_writePos = 0;
    m_readPos = 0;
}

#if METHOD == 0

bool tryPush(const T& item)
{
    int w = m_writePos.load(memory_order_relaxed);
    if (w >= size)
        return false;
    m_items[w] = item;
    atomic_thread_fence(memory_order_release);
    m_writePos.store(w + 1, memory_order_relaxed);
    return true;
}

bool tryPop(T& item)
{
    int w = m_writePos.load(memory_order_relaxed);
    if (m_readPos >= w)
        return false;
    atomic_thread_fence(memory_order_acquire);
    item = m_items[m_readPos];
    m_readPos++;
    return true;
}

#elif METHOD == 1

bool tryPush(const T& item)
{
    int w = m_writePos.load(memory_order_relaxed);
    if (w >= size)
        return false;
    m_items[w] = item;
    m_writePos.store(w + 1, memory_order_release);
    return true;
}

bool tryPop(T& item)
{
    int w = m_writePos.load(memory_order_acquire);
    if (m_readPos >= w)
        return false;
    item = m_items[m_readPos];
    m_readPos++;
    return true;
}

#elif METHOD == 2

bool tryPush(const T& item)
{
    int w = m_writePos;
    if (w >= size)
        return false;
    m_items[w] = item;
    m_writePos = w + 1;
    return true;
}

bool tryPop(T& item)
{
    int w = m_writePos;
    if (m_readPos >= w)
        return false;
    item = m_items[m_readPos];
    m_readPos++;
    return true;
}

#endif };

class Timer { private: chrono::high_resolution_clock::time_point t0, t1; double span;

public: Timer() { span = 0; } void begin() { t0 = chrono::high_resolution_clock::now(); } void end() { t1 = chrono::high_resolution_clock::now(); span += chrono::duration_cast<chrono::duration>(t1 - t0).count(); } double duration() { return span; } void clear() { span = 0; } };

static const int limit = 10000000;

CappedSPSCQueue<int, limit> TestQueue; atomic ready(false); Timer writeTimer, readTimer;

void writer() { ready = true; writeTimer.begin(); for (int i = 0; i < limit; i++) { TestQueue.tryPush(i); } writeTimer.end(); }

void reader() { static volatile int destination; while (!ready) {} readTimer.begin(); for (int i = 0; i < limit; i++) { int y; while (!TestQueue.tryPop((int&) y)) {} // Force the result to be stored somewhere, otherwise // the compiler will optimize y away and not bother // copying the item from the queue at all destination = y; } readTimer.end(); }

#if METHOD < 2 && !defined(arm) static const int iterations = 100; #else static const int iterations = 20; #endif

extern "C" void tests() { printf("METHOD = %d, queue size = %d\n", METHOD, limit);

writeTimer.clear();
readTimer.clear();
for (int i = 0; i < iterations; i++)
{
    TestQueue.clear();
    ready = false;
    
    writer();
    reader();
}
printf("separate: tryPush=%f ns/item, tryPop=%f ns/item\n", writeTimer.duration() / iterations / limit * 10e9, readTimer.duration() / iterations / limit * 10e9);

writeTimer.clear();
readTimer.clear();
for (int i = 0; i < iterations; i++)
{
    TestQueue.clear();
    ready = false;
    
    thread t1(writer);
    reader();
    t1.join();
}
printf("together: tryPush=%f ns/item, tryPop=%f ns/item\n", writeTimer.duration() / iterations / limit * 10e9, readTimer.duration() / iterations / limit * 10e9);

}

int main(int argc, const char * argv[]) { tests(); return 0; }