Add SSSE3-based blur implementation.

Calculations are done on integers rather than floating point numbers,
so this implementation is not as accurate (but when the scale factor is
reasonable, no artifacts are visible).
It is, however, faster by a factor of ~3.
Sebastian Frysztak 2016-10-29 14:32:49 +02:00
parent 3662b8e187
commit 72aec87047
4 changed files with 136 additions and 1 deletion
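The accuracy trade-off described in the commit message comes from quantising the
normalized Gaussian kernel to 8-bit fixed point. A minimal scalar sketch of that
idea follows (illustrative only, not code from this commit; KERNEL_SIZE and
SCALE_FACTOR mirror the defines added below, HALF_KERNEL is assumed to be
KERNEL_SIZE / 2, and the helper names are made up):

// sketch only; link with -lm, M_PI needs _GNU_SOURCE (the Makefile defines it)
#include <math.h>
#include <stdint.h>

#define KERNEL_SIZE  15
#define HALF_KERNEL  (KERNEL_SIZE / 2)
#define SCALE_FACTOR 7

static void build_fixed_point_kernel(float sigma, int8_t kernel[KERNEL_SIZE]) {
    float kernelf[KERNEL_SIZE], sum = 0;
    float coeff = 1.0f / sqrtf(2 * (float)M_PI * sigma * sigma);

    for (int i = 0; i < KERNEL_SIZE; i++) {
        float x = HALF_KERNEL - i;
        kernelf[i] = coeff * expf(-x * x / (2.0f * sigma * sigma));
        sum += kernelf[i];
    }
    // normalize, scale by 2^SCALE_FACTOR and round: every weight becomes a
    // multiple of 1/128, which is the only source of the accuracy loss
    for (int i = 0; i < KERNEL_SIZE; i++)
        kernel[i] = (int8_t)rintf(kernelf[i] / sum * (1 << SCALE_FACTOR));
}

// one output channel value: integer multiply-accumulate, then a shift to undo
// the scaling; the SSSE3 code gets the final clamp to 255 from _mm_packus_epi16
static uint8_t convolve_one_channel(const uint8_t samples[KERNEL_SIZE],
                                    const int8_t kernel[KERNEL_SIZE]) {
    int acc = 0;
    for (int i = 0; i < KERNEL_SIZE; i++)
        acc += samples[i] * kernel[i];
    acc >>= SCALE_FACTOR;
    return (uint8_t)(acc > 255 ? 255 : acc);
}

Replacing the float multiply-adds of the SSE2 path with these small integer
products is what enables the pmaddubsw-based inner loop and the ~3x speedup the
message mentions, at the cost of weights rounded to 1/128 steps.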

Makefile

@@ -14,6 +14,7 @@ endif
CFLAGS += -std=c99
CFLAGS += -pipe
CFLAGS += -Wall
CFLAGS += -mssse3
CFLAGS += -O2
CPPFLAGS += -D_GNU_SOURCE
CPPFLAGS += -DXKBCOMPOSE=$(shell if test -e /usr/include/xkbcommon/xkbcommon-compose.h ; then echo 1 ; else echo 0 ; fi )

blur.c

@@ -70,7 +70,8 @@ blur_image_surface (cairo_surface_t *surface, int radius)
    dst_stride = cairo_image_surface_get_stride (tmp);
    //blur_impl_naive(src, dst, width, height, src_stride, dst_stride, 10000);
    blur_impl_sse2(src, dst, width, height, 4.5);
    //blur_impl_sse2(src, dst, width, height, 4.5);
    blur_impl_ssse3(src, dst, width, height, 4.5);
    cairo_surface_destroy (tmp);
    cairo_surface_flush (surface);

blur.h

@@ -8,6 +8,8 @@ void blur_image_surface (cairo_surface_t *surface, int radius);
void blur_impl_naive(uint32_t* src, uint32_t* dst, int width, int height, int src_stride, int dst_stride, int radius);
void blur_impl_sse2(uint32_t* src, uint32_t* dst, int width, int height, float sigma);
void blur_impl_horizontal_pass_sse2(uint32_t *src, uint32_t *dst, float *kernel, int width, int height);
void blur_impl_ssse3(uint32_t* src, uint32_t* dst, int width, int height, float sigma);
void blur_impl_horizontal_pass_ssse3(uint32_t *src, uint32_t *dst, int8_t *kernel, int width, int height);
#endif


@@ -10,6 +10,7 @@
#include "blur.h"
#include <math.h>
#include <xmmintrin.h>
#include <tmmintrin.h>
#define ALIGN16 __attribute__((aligned(16)))
#define KERNEL_SIZE 15
@@ -19,6 +20,11 @@
// input pixels for given kernel size
#define REGISTERS_CNT (KERNEL_SIZE + 4/2) / 4
// scaling factor for kernel coefficients.
// higher values cause desaturation.
// used in SSSE3 implementation.
#define SCALE_FACTOR 7

void blur_impl_sse2(uint32_t *src, uint32_t *dst, int width, int height, float sigma) {
    // prepare kernel
    float kernel[KERNEL_SIZE];
@@ -117,3 +123,128 @@ void blur_impl_horizontal_pass_sse2(uint32_t *src, uint32_t *dst, float *kernel,
        }
    }
}

void blur_impl_ssse3(uint32_t *src, uint32_t *dst, int width, int height, float sigma) {
    // prepare kernel
    float kernelf[KERNEL_SIZE];
    int8_t kernel[KERNEL_SIZE + 1];
    float coeff = 1.0 / sqrtf(2 * M_PI * sigma * sigma), sum = 0;

    for (int i = 0; i < KERNEL_SIZE; i++) {
        float x = HALF_KERNEL - i;
        kernelf[i] = coeff * expf(-x * x / (2.0 * sigma * sigma));
        sum += kernelf[i];
    }

    // normalize kernel
    for (int i = 0; i < KERNEL_SIZE; i++)
        kernelf[i] /= sum;

    // round to nearest integer and convert to int
    for (int i = 0; i < KERNEL_SIZE; i++)
        kernel[i] = (int8_t)rintf(kernelf[i] * (1 << SCALE_FACTOR));
    kernel[KERNEL_SIZE] = 0;

    // horizontal pass includes image transposition:
    // instead of writing pixel src[x] to dst[x],
    // we write it to transposed location.
    // (to be exact: dst[height * current_column + current_row])
    blur_impl_horizontal_pass_ssse3(src, dst, kernel, width, height);
    blur_impl_horizontal_pass_ssse3(dst, src, kernel, height, width);
}

void blur_impl_horizontal_pass_ssse3(uint32_t *src, uint32_t *dst, int8_t *kernel, int width, int height) {
    __m128i _kern = _mm_loadu_si128((__m128i*)kernel);
    __m128i rgbaIn[REGISTERS_CNT];

    for (int row = 0; row < height; row++) {
        for (int column = 0; column < width; column++, src++) {
            uint32_t _rgbaIn[KERNEL_SIZE] ALIGN16;

            // handle borders
            int leftBorder = column < HALF_KERNEL;
            int rightBorder = column > width - HALF_KERNEL;
            if (leftBorder || rightBorder) {
                int i = 0;
                if (leftBorder) {
                    // for kernel size 7 and column == 0, we have:
                    // x x x P0 P1 P2 P3
                    // first loop mirrors P{0..3} to fill x's,
                    // second one loads P{0..3}
                    for (; i < HALF_KERNEL - column; i++)
                        _rgbaIn[i] = *(src + (HALF_KERNEL - i));
                    for (; i < KERNEL_SIZE; i++)
                        _rgbaIn[i] = *(src - (HALF_KERNEL - i));
                } else {
                    for (; i < width - column; i++)
                        _rgbaIn[i] = *(src + i);
                    for (int k = 0; i < KERNEL_SIZE; i++, k++)
                        _rgbaIn[i] = *(src - k);
                }

                for (int k = 0; k < REGISTERS_CNT; k++)
                    rgbaIn[k] = _mm_load_si128((__m128i*)(_rgbaIn + 4*k));
            } else {
                for (int k = 0; k < REGISTERS_CNT; k++)
                    rgbaIn[k] = _mm_loadu_si128((__m128i*)(src + 4*k - HALF_KERNEL));
            }

            // basis of this implementation is _mm_maddubs_epi16 (aka pmaddubsw).
            // 'rgba' holds 16 unsigned bytes, so 4 pixels.
            // 'kern' holds 16 signed bytes: kernel values multiplied by (1 << SCALE_FACTOR).
            // before multiplication takes place, vectors need to be prepared:
            // 'rgba' is shuffled from R1G1B1A1...R4G4B4A4 to R1R2R3R4...A1A2A3A4
            // 'kern' is shuffled from w1w2w3w4...w13w14w15w16 to w1w2w3w4 repeated 4 times
            // then we call _mm_maddubs_epi16 and we get:
            // --------------------------------------------------------------------------------------
            // | R1*w1 + R2*w2 | R3*w3 + R4*w4 | G1*w1 + G2*w2 | G3*w3 + G4*w4 | repeat for B and A |
            // --------------------------------------------------------------------------------------
            // each 'rectangle' is a 16-bit signed int.
            // then we repeat the process for the rest of the input pixels,
            // call _mm_hadds_epi16 to add adjacent ints, and shift right by SCALE_FACTOR.
            __m128i rgba, kern;
            __m128i zero = _mm_setzero_si128();
            __m128i acc = _mm_setzero_si128();

            const __m128i rgba_shuf_mask = _mm_setr_epi8(0, 4, 8, 12,
                                                         1, 5, 9, 13,
                                                         2, 6, 10, 14,
                                                         3, 7, 11, 15);
            const __m128i kern_shuf_mask = _mm_setr_epi8(0, 1, 2, 3,
                                                         0, 1, 2, 3,
                                                         0, 1, 2, 3,
                                                         0, 1, 2, 3);

            rgba = _mm_shuffle_epi8(rgbaIn[0], rgba_shuf_mask);
            kern = _mm_shuffle_epi8(_kern, kern_shuf_mask);
            acc = _mm_adds_epi16(acc, _mm_maddubs_epi16(rgba, kern));

            rgba = _mm_shuffle_epi8(rgbaIn[1], rgba_shuf_mask);
            kern = _mm_shuffle_epi8(_mm_srli_si128(_kern, 4), kern_shuf_mask);
            acc = _mm_adds_epi16(acc, _mm_maddubs_epi16(rgba, kern));

            rgba = _mm_shuffle_epi8(rgbaIn[2], rgba_shuf_mask);
            kern = _mm_shuffle_epi8(_mm_srli_si128(_kern, 8), kern_shuf_mask);
            acc = _mm_adds_epi16(acc, _mm_maddubs_epi16(rgba, kern));

            rgba = _mm_shuffle_epi8(rgbaIn[3], rgba_shuf_mask);
            kern = _mm_shuffle_epi8(_mm_srli_si128(_kern, 12), kern_shuf_mask);
            acc = _mm_adds_epi16(acc, _mm_maddubs_epi16(rgba, kern));

            acc = _mm_hadds_epi16(acc, zero);
            acc = _mm_srai_epi16(acc, SCALE_FACTOR);

            // Cairo sets the alpha channel to 255
            // (or -1, depending on how you look at it).
            // This quickly overflows the accumulator,
            // and alpha is calculated completely wrong.
            // I assume most people don't use semi-transparent
            // lock screen images, so no one will mind if we
            // 'correct it' by setting alpha to 255.
            *(dst + height * column + row) =
                _mm_cvtsi128_si32(_mm_packus_epi16(acc, zero)) | 0xFF000000;
        }
    }
}
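
For reference, here is a scalar sketch (not from the commit) of what one transposed
"horizontal" pass computes and why calling it twice with width and height swapped,
as blur_impl_ssse3() does, yields the full 2D blur. It clamps at the image borders
instead of mirroring like blur_impl_horizontal_pass_ssse3(), and it assumes the
stride equals the width in pixels:

#include <stdint.h>

#define KERNEL_SIZE  15
#define HALF_KERNEL  (KERNEL_SIZE / 2)
#define SCALE_FACTOR 7

static void horizontal_pass_transposed(const uint32_t *src, uint32_t *dst,
                                       const int8_t *kernel, int width, int height) {
    for (int row = 0; row < height; row++) {
        for (int column = 0; column < width; column++) {
            int sum[4] = { 0, 0, 0, 0 };

            for (int i = 0; i < KERNEL_SIZE; i++) {
                int x = column + i - HALF_KERNEL;
                if (x < 0) x = 0;                   // clamp at the borders;
                if (x > width - 1) x = width - 1;   // the SSSE3 code mirrors instead
                uint32_t p = src[row * width + x];
                for (int c = 0; c < 4; c++)
                    sum[c] += ((p >> (8 * c)) & 0xFF) * kernel[i];
            }

            uint32_t out = 0;
            for (int c = 0; c < 4; c++) {
                int v = sum[c] >> SCALE_FACTOR;     // undo the kernel scaling
                out |= (uint32_t)(v > 255 ? 255 : v) << (8 * c);
            }
            // transposed write: dst is laid out as height x width
            dst[height * column + row] = out;
        }
    }
}

Because every pass writes dst[height * column + row], rows of the output are
columns of the input; the second pass therefore blurs what were originally the
columns and transposes the image back to its original orientation:

    horizontal_pass_transposed(src, dst, kernel, width, height);
    horizontal_pass_transposed(dst, src, kernel, height, width);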
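
Finally, a self-contained demo (again not part of the commit) of the pmaddubsw
pipeline described in the comments of blur_impl_horizontal_pass_ssse3(), cut down
to a single group of 4 pixels with 4 equal weights of 32 (i.e. 0.25 * (1 << 7)),
so the output should simply be the per-channel average. Compile with -mssse3:

#include <stdint.h>
#include <stdio.h>
#include <tmmintrin.h>

#define SCALE_FACTOR 7

int main(void) {
    // 4 input pixels, 4 bytes each (the channel order does not matter here)
    uint8_t pixels[16] = {
        10,  20,  30, 255,   // P1
        20,  40,  60, 255,   // P2
        30,  60,  90, 255,   // P3
        40,  80, 120, 255,   // P4
    };
    // 4 weights of 32 (= 0.25 * (1 << SCALE_FACTOR)), zero-padded to 16 bytes
    int8_t weights[16] = { 32, 32, 32, 32 };

    __m128i rgba = _mm_loadu_si128((__m128i*)pixels);
    __m128i kern = _mm_loadu_si128((__m128i*)weights);

    // group bytes by channel: C0 of P1..P4, C1 of P1..P4, and so on
    const __m128i rgba_shuf_mask = _mm_setr_epi8(0, 4, 8, 12,
                                                 1, 5, 9, 13,
                                                 2, 6, 10, 14,
                                                 3, 7, 11, 15);
    // repeat the 4 weights once per channel group
    const __m128i kern_shuf_mask = _mm_setr_epi8(0, 1, 2, 3,
                                                 0, 1, 2, 3,
                                                 0, 1, 2, 3,
                                                 0, 1, 2, 3);
    rgba = _mm_shuffle_epi8(rgba, rgba_shuf_mask);
    kern = _mm_shuffle_epi8(kern, kern_shuf_mask);

    // u8 * s8 products, adjacent pairs summed into signed 16-bit words:
    // [P1*w1 + P2*w2 | P3*w3 + P4*w4] for channel 0, then channels 1, 2, 3
    __m128i acc = _mm_maddubs_epi16(rgba, kern);

    // add the two partial sums of every channel, then undo the kernel scaling
    __m128i zero = _mm_setzero_si128();
    acc = _mm_hadds_epi16(acc, zero);
    acc = _mm_srai_epi16(acc, SCALE_FACTOR);

    // saturate to unsigned bytes; the low 32 bits are the output pixel
    uint32_t out = (uint32_t)_mm_cvtsi128_si32(_mm_packus_epi16(acc, zero));
    printf("%u %u %u %u\n", out & 0xFF, (out >> 8) & 0xFF,
                            (out >> 16) & 0xFF, (out >> 24) & 0xFF);
    // prints: 25 50 75 255 (the per-channel averages)
    return 0;
}

With the full 15-tap kernel the same shuffle/multiply-add step simply runs four
times, once per group of 4 input pixels, and the partial sums are combined with
_mm_adds_epi16 before the final horizontal add.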