Built-in mod ('%') vs custom mod function: improve the performance of modulus operation

According to Chandler Carruth's benchmarks at CppCon 2015, the fastest modulo operator (on x86, when compiled with Clang) is:

// Computes `input % ceil`, skipping the division when it is not needed.
//
// When `input` is already below `ceil` the remainder is `input` itself, so
// the comparatively expensive `%` is applied only when `input >= ceil`.
//
// NB: both arguments are assumed to be positive; for a negative `input`
// the shortcut branch returns `input` unchanged.
int fast_mod(const int input, const int ceil) {
    return input >= ceil ? input % ceil : input;
}

I suggest that you watch the whole talk; he goes into more detail on why this method is faster than just using % unconditionally.

This will likely be compiler and platform dependent.

But I was interested, and in my benchmarks on my system you appear to be correct. However, the method from @865719's answer is the fastest:

#include <chrono>
#include <iostream>

// Minimal stopwatch built on std::chrono::steady_clock.
//
// Call start() and stop() around the code being measured; secs() (or
// streaming the Timer with operator<<) then yields the elapsed time.
class Timer
{
    using clk = std::chrono::steady_clock;
    using microseconds = std::chrono::microseconds;

    clk::time_point tsb; // beginning of the measured interval
    clk::time_point tse; // end of the measured interval

public:
    // Resets both time points to "now", so secs() reports 0.
    void clear() { tsb = tse = clk::now(); }
    // Records the beginning of the interval.
    void start() { tsb = clk::now(); }
    // Records the end of the interval.
    void stop() { tse = clk::now(); }

    // Streams the elapsed time in seconds (the value of secs()).
    friend std::ostream& operator<<(std::ostream& o, const Timer& timer)
    {
        return o << timer.secs();
    }

    // Returns the time difference in seconds, truncated to whole
    // microseconds; returns 0.0 when the end point is not after the start.
    double secs() const
    {
        if(tse <= tsb)
            return 0.0;
        auto d = std::chrono::duration_cast<microseconds>(tse - tsb);
        return d.count() / 1000000.0;
    }
};

// Hand-written remainder: a - b * (a / b).
//
// Integer division truncates toward zero, so the result equals `a % b`
// for every non-zero `b`. `b == 0` is a division by zero, exactly as it
// is for the built-in operator.
int mod(int a, int b)
{
    int tmp=a/b;
    return a-(b*tmp);
}

// Computes `input % ceil`, skipping the division when it is not needed.
//
// When `input` is already below `ceil` the remainder is `input` itself, so
// the comparatively expensive `%` is applied only when `input >= ceil`.
//
// NB: both arguments are assumed to be positive; for a negative `input`
// the shortcut branch returns `input` unchanged.
int fast_mod(const int input, const int ceil) {
    return input < ceil ? input : input % ceil;
}

int main()
    auto N = 1000000000U;
    unsigned sum = 0;

    Timer timer;

    for(auto times = 0U; times < 3; ++times)
        std::cout << "     run: " << (times + 1) << '\n';

        sum = 0;
        for(decltype(N) n = 0; n < N; ++n)
            sum += n % (N - n);

        std::cout << "       %: " << sum << " " << timer << "s" << '\n';

        sum = 0;
        for(decltype(N) n = 0; n < N; ++n)
            sum += mod(n, N - n);

        std::cout << "     mod: " << sum << " " << timer << "s" << '\n';

        sum = 0;
        for(decltype(N) n = 0; n < N; ++n)
            sum += fast_mod(n, N - n);

        std::cout << "fast_mod: " << sum << " " << timer << "s" << '\n';

Build: GCC 5.1.1 (x86_64)

g++ -std=c++14 -march=native -O3 -g0 ...


     run: 1
       %: 3081207628 5.49396s
     mod: 3081207628 4.30814s
fast_mod: 3081207628 2.51296s
     run: 2
       %: 3081207628 5.5522s
     mod: 3081207628 4.25427s
fast_mod: 3081207628 2.52364s
     run: 3
       %: 3081207628 5.4947s
     mod: 3081207628 4.29646s
fast_mod: 3081207628 2.56916s