# Faster way for extracting and combining bits from UINT16 to UINT8

Here is an idea. Observe one thing here:

A = A0 OR A1
B = B0 OR B1
C = C0 OR C1
D = D0 OR D1


You have 4 or operations. You can perform all of them in 1 instruction:

PairFlags = (PairFlags | (PairFlags >> 1))


Now you bits are aligned like that:

[D1][D1 or D0][D0 or C1][C1 or C0][C0 or B1][B1 or B0][B0 or A1][A1 or A0]


So you just need to extract bits 0, 2, 4, 6 to get the result.

Bit 1 should be set to bit 2.

Bit 2 should be set to bit 4.

Bit 3 should be set to bit 6.

Final code something like that:

PairFlags = (PairFlags | (PairFlags >> 1))
PairFlags = (PairFlags&1) | ((PairFlags&4)>>1) | ((PairFlags&16)>>2) | ((PairFlags&64)>>3)


Assuming I got everything right (not tested), this seems to generate good, branch-free code at least on gcc and clang for x86 (-O3):

uint8_t convert (uint8_t ChannelFlags)
{
return ( ((ChannelFlags & A1A0)!=0) << A_POS ) |
( ((ChannelFlags & B1B0)!=0) << B_POS ) |
( ((ChannelFlags & C1C0)!=0) << C_POS ) |
( ((ChannelFlags & D1D0)!=0) << D_POS ) ;
}


This masks out each individual bitset, then check against zero to end up with 1 or 0 in a temporary int. This value is shifted in position in the result, before everything is finally bitwise OR:ed together. Full code:

#include <stdint.h>

#define A1A0  (3u << 0)
#define B1B0  (3u << 2)
#define C1C0  (3u << 4)
#define D1D0  (3u << 6)

#define A_POS 0
#define B_POS 1
#define C_POS 2
#define D_POS 3

uint8_t convert (uint8_t ChannelFlags)
{
return ( ((ChannelFlags & A1A0)!=0) << A_POS ) |
( ((ChannelFlags & B1B0)!=0) << B_POS ) |
( ((ChannelFlags & C1C0)!=0) << C_POS ) |
( ((ChannelFlags & D1D0)!=0) << D_POS ) ;
}


clang disassembly x86 gives 18 instructions branch free:

convert:                                # @convert
test    dil, 3
setne   al
test    dil, 12
setne   cl
or      cl, al
test    dil, 48
setne   al
shl     al, 2
or      al, cl
mov     ecx, edi
shr     cl, 7
shr     dil, 6
and     dil, 1
or      dil, cl
shl     dil, 3
or      al, dil
ret


The following should work for reducing a 16-bit value to 8 bits (with each bit of output formed by ORing a pair of bits of input):

// Set even bits to bits in pair ORed together, and odd bits to 0...
PairFlags = (ChannelFlags | (ChannelFlags >> 1)) & 0x5555; // '0h0g0f0e0d0c0b0a'
// Compress the '00' or '01' bit pairs down to single '0' or '1' bits...
PairFlags = (PairFlags ^ (PairFlags >> 1)) & 0x3333; // '00hg00fe00dc00ba'
PairFlags = (PairFlags ^ (PairFlags >> 2)) & 0x0F0F; // '0000hgfe0000dcba'
PairFlags = (PairFlags ^ (PairFlags >> 4)) & 0x00FF; // '00000000hgfedcba'


Note: The ^ can be replaced by | in the above for the same result.

Not sure if more efficient but instead of using a ternary if, why not use only bitwise operations ? And just offset it with the bitshift operator

PairFlags = ((ChannelFlags & (0b1 << 0)) | (ChannelFlags & (0b10 << 0))) << 0;
PairFlags = ((ChannelFlags & (0b1 << 2)) | (ChannelFlags & (0b10 << 2))) << 1;
PairFlags = ((ChannelFlags & (0b1 << 4)) | (ChannelFlags & (0b10 << 4))) << 2;
//...