i3lock-color/blur_simd.c

/*
 * vim:ts=4:sw=4:expandtab
 *
 * © 2016 Sebastian Frysztak
 *
 * See LICENSE for licensing information
 *
 */

#include "blur.h"
#include <xmmintrin.h>

// Number of xmm registers (4 pixels each) used to store the input pixels for
// the given kernel size. The whole expansion is parenthesized so the macro
// behaves as a single value inside larger expressions such as
// `4 * REGISTERS_CNT` or `n / REGISTERS_CNT`.
#define REGISTERS_CNT (((KERNEL_SIZE) + 4 / 2) / 4)
#ifdef __SSE2__
/*
 * Clamps a column index into [0, width - 1] so that border reads never leave
 * the current row. Expects width >= 1.
 */
static inline int sse2_clamp_column(int column, int width) {
    if (column < 0)
        return 0;
    if (column >= width)
        return width - 1;
    return column;
}

/*
 * Horizontal box-blur pass using SSE2.
 *
 * For every pixel of `src` (width * height 32-bit pixels, row-major, each
 * pixel treated as four independent 8-bit channels) the KERNEL_SIZE pixels
 * centred on it in the same row are averaged per channel. The result is
 * written transposed: the pixel at (row, column) lands at
 * dst[height * column + row], so `dst` must hold width * height pixels too.
 *
 * src:    input pixels, only read
 * dst:    output pixels, written transposed
 * width:  number of pixels per row in src
 * height: number of rows in src
 *
 * The accumulation below is written for a kernel size of 7 (two xmm
 * registers per window, 8th pixel masked out).
 */
void blur_impl_horizontal_pass_sse2(uint32_t *src, uint32_t *dst, int width, int height) {
    if (width <= 0 || height <= 0)
        return;

    // HALF_KERNEL / REGISTERS_CNT are wrapped in parentheses wherever they
    // take part in a larger expression, in case their definitions are not.
    const int half = (HALF_KERNEL);
    // number of pixels fetched by the vector loads of one window
    const int loaded = 4 * (REGISTERS_CNT);
    const uint32_t *src_end = src + width * height;
    const uint32_t *line = src;

    for (int row = 0; row < height; row++, line += width) {
        for (int column = 0; column < width; column++) {
            const uint32_t *px = line + column;
            __m128i rgbaIn[REGISTERS_CNT];

            // handle borders: a column is a border column whenever the
            // window [column - half, column + half] leaves the row
            int leftBorder = column < half;
            int rightBorder = column + half >= width;
            // the unaligned loads read `loaded` pixels starting at px - half;
            // they must end inside the source buffer
            int loadsFit = (src_end - px) >= (loaded - half);

            if (!leftBorder && !rightBorder && loadsFit) {
                // fast path: window lies inside the row, load it directly
                for (int k = 0; k < (REGISTERS_CNT); k++)
                    rgbaIn[k] = _mm_loadu_si128((const __m128i *)(px - half + 4 * k));
            } else {
                // slow path: gather the window into an aligned, zero-filled
                // buffer so that no lane is ever loaded uninitialized
                uint32_t window[(KERNEL_SIZE) + 1] __attribute__((aligned(16))) = {0};
                int i = 0;
                if (leftBorder) {
                    // for kernel size 7x7 and column == 0, we have:
                    // x x x P0 P1 P2 P3
                    // first loop mirrors P{0..3} to fill x's,
                    // second one loads P{0..3}
                    for (; i < half - column; i++)
                        window[i] = line[sse2_clamp_column(column + (half - i), width)];
                    for (; i < (KERNEL_SIZE); i++)
                        window[i] = line[sse2_clamp_column(column - (half - i), width)];
                } else if (rightBorder) {
                    // pixels from the centre up to the end of the row ...
                    for (; i < width - column; i++)
                        window[i] = line[column + i];
                    // ... then fill the rest walking left from the centre
                    for (int k = 0; i < (KERNEL_SIZE); i++, k++)
                        window[i] = line[sse2_clamp_column(column - k, width)];
                } else {
                    // interior pixel at the very end of the buffer: the
                    // window itself is in bounds, only the vector load is not
                    for (; i < (KERNEL_SIZE); i++)
                        window[i] = px[i - half];
                }

                for (int k = 0; k < (REGISTERS_CNT); k++)
                    rgbaIn[k] = _mm_load_si128((const __m128i *)(window + 4 * k));
            }

            __m128i zero = _mm_setzero_si128();
            __m128i acc = _mm_setzero_si128();

            // widen channels to 16 bits and sum; each unpack yields two pixels
            acc = _mm_add_epi16(acc, _mm_unpacklo_epi8(rgbaIn[0], zero));
            acc = _mm_add_epi16(acc, _mm_unpackhi_epi8(rgbaIn[0], zero));
            acc = _mm_add_epi16(acc, _mm_unpacklo_epi8(rgbaIn[1], zero));

            // kernel size equals to 7, but we can only load multiples of 4 pixels
            // we have to set 8th pixel to zero
            acc = _mm_add_epi16(acc, _mm_andnot_si128(_mm_set_epi32(-1, -1, 0, 0),
                                                      _mm_unpackhi_epi8(rgbaIn[1], zero)));
            // low and high halves hold partial per-channel sums; widen to
            // 32 bits and add them to get one sum per channel
            acc = _mm_add_epi32(_mm_unpacklo_epi16(acc, zero),
                                _mm_unpackhi_epi16(acc, zero));

            // multiplication is significantly faster than division
            acc = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(acc),
                                             _mm_set1_ps(1.0 / (KERNEL_SIZE))));

            // pack the four channel averages back into one 32-bit pixel and
            // store it transposed
            *(dst + height * column + row) =
                (uint32_t)_mm_cvtsi128_si32(_mm_packus_epi16(_mm_packs_epi32(acc, zero), zero));
        }
    }
}
#endif
Add SSE2-optimized blur. About 4-6 times faster than naive implementation. 2016-10-22 09:30:27 -04:00			`/*`
			`* vim:ts=4:sw=4:expandtab`
			`*`
revert back to better blurring behaviour 2017-12-06 13:57:07 -05:00			`* © 2016 Sebastian Frysztak`
Add SSE2-optimized blur. About 4-6 times faster than naive implementation. 2016-10-22 09:30:27 -04:00			`*`
			`* See LICENSE for licensing information`
			`*`
			`*/`

			`#include "blur.h"`
			`#include <xmmintrin.h>`

revert back to better blurring behaviour 2017-12-06 13:57:07 -05:00			`// number of xmm registers needed to store input pixels for given kernel size`
Extend kernel size to 15x15. 2016-10-28 11:35:33 -04:00			`#define REGISTERS_CNT (KERNEL_SIZE + 4/2) / 4`
fix compiling for 32-bit machines / machine without SSE2 2017-12-07 16:09:13 -05:00			`#ifdef __SSE2__`
revert back to better blurring behaviour 2017-12-06 13:57:07 -05:00			`void blur_impl_horizontal_pass_sse2(uint32_t src, uint32_t dst, int width, int height) {`
blurring stuff should work perfectly fine now 2017-12-05 22:07:38 -05:00			`uint32_t* o_src = src;`
Add SSE2-optimized blur. About 4-6 times faster than naive implementation. 2016-10-22 09:30:27 -04:00			`for (int row = 0; row < height; row++) {`
			`for (int column = 0; column < width; column++, src++) {`
Extend kernel size to 15x15. 2016-10-28 11:35:33 -04:00			`__m128i rgbaIn[REGISTERS_CNT];`
Add SSE2-optimized blur. About 4-6 times faster than naive implementation. 2016-10-22 09:30:27 -04:00
			`// handle borders`
			`int leftBorder = column < HALF_KERNEL;`
Improve border handling for larger kernels. 2016-10-28 11:36:43 -04:00			`int rightBorder = column > width - HALF_KERNEL;`
revert back to better blurring behaviour 2017-12-06 13:57:07 -05:00			`uint32_t _rgbaIn[KERNEL_SIZE + 1] __attribute__((aligned(16)));`
			`int i = 0;`
			`if (leftBorder) {`
			`// for kernel size 7x7 and column == 0, we have:`
			`// x x x P0 P1 P2 P3`
			`// first loop mirrors P{0..3} to fill x's,`
			`// second one loads P{0..3}`
			`for (; i < HALF_KERNEL - column; i++)`
			`_rgbaIn[i] = *(src + (HALF_KERNEL - i));`
			`for (; i < KERNEL_SIZE; i++)`
			`_rgbaIn[i] = *(src - (HALF_KERNEL - i));`
Slightly refactor border handling code. 2016-11-04 17:41:17 -04:00
			`for (int k = 0; k < REGISTERS_CNT; k++)`
			`rgbaIn[k] = _mm_load_si128((__m128i)(_rgbaIn + 4k));`
revert back to better blurring behaviour 2017-12-06 13:57:07 -05:00			`} else if (rightBorder) {`
			`for (; i < width - column; i++)`
			`_rgbaIn[i] = *(src + i);`
			`for (int k = 0; i < KERNEL_SIZE; i++, k++)`
			`_rgbaIn[i] = *(src - k);`
Add SSSE3-based blur implementation. Calculations are done on integer, rather than floating point numbers, so this implementation is not as accurate (but when scale factor is reasonable enough, no artifacs are visible). It is, however, faster by a factor of ~3. 2016-10-29 08:32:49 -04:00
revert back to better blurring behaviour 2017-12-06 13:57:07 -05:00			`for (int k = 0; k < REGISTERS_CNT; k++)`
Extend kernel size to 15x15. 2016-10-28 11:35:33 -04:00			`rgbaIn[k] = _mm_load_si128((__m128i)(_rgbaIn + 4k));`
Add SSE2-optimized blur. About 4-6 times faster than naive implementation. 2016-10-22 09:30:27 -04:00			`} else {`
first commit towards fixing this 2017-12-05 12:53:54 -05:00			`for (int k = 0; k < REGISTERS_CNT; k++) {`
fix compiling for 32-bit machines / machine without SSE2 2017-12-07 16:09:13 -05:00			`if ((uintptr_t) (((__m128i) src + 4k - HALF_KERNEL) + 1)`
			`> (uintptr_t) (o_src + (height * width)))`
blurring stuff should work perfectly fine now 2017-12-05 22:07:38 -05:00			`break;`
Add SSSE3-based blur implementation. Calculations are done on integer, rather than floating point numbers, so this implementation is not as accurate (but when scale factor is reasonable enough, no artifacs are visible). It is, however, faster by a factor of ~3. 2016-10-29 08:32:49 -04:00			`rgbaIn[k] = _mm_loadu_si128((__m128i)(src + 4k - HALF_KERNEL));`
first commit towards fixing this 2017-12-05 12:53:54 -05:00			`}`
Add SSE2-optimized blur. About 4-6 times faster than naive implementation. 2016-10-22 09:30:27 -04:00			`}`

			`__m128i zero = _mm_setzero_si128();`
SSE2: switch from Gaussian to box blur 2016-11-05 11:01:40 -04:00			`__m128i acc = _mm_setzero_si128();`
Add SSE2-optimized blur. About 4-6 times faster than naive implementation. 2016-10-22 09:30:27 -04:00
revert back to better blurring behaviour 2017-12-06 13:57:07 -05:00			`acc = _mm_add_epi16(acc, _mm_unpacklo_epi8(rgbaIn[0], zero));`
			`acc = _mm_add_epi16(acc, _mm_unpackhi_epi8(rgbaIn[0], zero));`
			`acc = _mm_add_epi16(acc, _mm_unpacklo_epi8(rgbaIn[1], zero));`
Add SSSE3-based blur implementation. Calculations are done on integer, rather than floating point numbers, so this implementation is not as accurate (but when scale factor is reasonable enough, no artifacs are visible). It is, however, faster by a factor of ~3. 2016-10-29 08:32:49 -04:00
revert back to better blurring behaviour 2017-12-06 13:57:07 -05:00			`// kernel size equals to 7, but we can only load multiples of 4 pixels`
			`// we have to set 8th pixel to zero`
			`acc = _mm_add_epi16(acc, _mm_andnot_si128(_mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0, 0),`
			`_mm_unpackhi_epi8(rgbaIn[1], zero)));`
			`acc = _mm_add_epi32(_mm_unpacklo_epi16(acc, zero),`
			`_mm_unpackhi_epi16(acc, zero));`
Add SSSE3-based blur implementation. Calculations are done on integer, rather than floating point numbers, so this implementation is not as accurate (but when scale factor is reasonable enough, no artifacs are visible). It is, however, faster by a factor of ~3. 2016-10-29 08:32:49 -04:00
revert back to better blurring behaviour 2017-12-06 13:57:07 -05:00			`// multiplication is significantly faster than division`
			`acc = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(acc),`
			`_mm_set1_ps(1.0/KERNEL_SIZE)));`
Add SSSE3-based blur implementation. Calculations are done on integer, rather than floating point numbers, so this implementation is not as accurate (but when scale factor is reasonable enough, no artifacs are visible). It is, however, faster by a factor of ~3. 2016-10-29 08:32:49 -04:00
			`(dst + height column + row) =`
revert back to better blurring behaviour 2017-12-06 13:57:07 -05:00			`_mm_cvtsi128_si32(_mm_packus_epi16(_mm_packs_epi32(acc, zero), zero));`
Add SSE2-optimized blur. About 4-6 times faster than naive implementation. 2016-10-22 09:30:27 -04:00			`}`
			`}`
			`}`
fix compiling for 32-bit machines / machine without SSE2 2017-12-07 16:09:13 -05:00			`#endif`