mirror of
https://github.com/Raymo111/i3lock-color.git
synced 2024-11-11 13:50:52 -05:00
SSE2: resize filter to 7x7. clean up a little.
This commit is contained in:
parent
ab41586b39
commit
b47631d785
1 changed files with 13 additions and 18 deletions
31
blur_simd.c
31
blur_simd.c
|
@ -14,7 +14,7 @@
|
|||
#include <immintrin.h>
|
||||
|
||||
#define ALIGN16 __attribute__((aligned(16)))
|
||||
#define KERNEL_SIZE 15
|
||||
#define KERNEL_SIZE 7
|
||||
#define HALF_KERNEL KERNEL_SIZE / 2
|
||||
|
||||
// number of xmm registers needed to store
|
||||
|
@ -90,30 +90,25 @@ void blur_impl_horizontal_pass_sse2(uint32_t *src, uint32_t *dst, float *kernel,
|
|||
rgbaIn[k] = _mm_loadu_si128((__m128i*)(src + 4*k - HALF_KERNEL));
|
||||
}
|
||||
|
||||
__m128i tmp;
|
||||
__m128i zero = _mm_setzero_si128();
|
||||
__m128i acc = _mm_setzero_si128();
|
||||
|
||||
for (int i = 0; i < 3; i++)
|
||||
{
|
||||
acc = _mm_add_epi16(acc, _mm_unpacklo_epi8(rgbaIn[i], zero));
|
||||
acc = _mm_add_epi16(acc, _mm_unpackhi_epi8(rgbaIn[i], zero));
|
||||
}
|
||||
acc = _mm_add_epi16(acc, _mm_unpacklo_epi8(rgbaIn[0], zero));
|
||||
acc = _mm_add_epi16(acc, _mm_unpackhi_epi8(rgbaIn[0], zero));
|
||||
acc = _mm_add_epi16(acc, _mm_unpacklo_epi8(rgbaIn[1], zero));
|
||||
|
||||
acc = _mm_add_epi16(acc, _mm_unpacklo_epi8(rgbaIn[3], zero));
|
||||
|
||||
tmp = _mm_unpackhi_epi8(rgbaIn[3], zero);
|
||||
// set 16th pixel to zeroes
|
||||
tmp = _mm_andnot_si128(_mm_set_epi16(0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0,0,0,0), tmp);
|
||||
acc = _mm_add_epi16(acc, tmp);
|
||||
acc = _mm_add_epi32(_mm_unpacklo_epi16(acc, zero), _mm_unpackhi_epi16(acc, zero));
|
||||
// kernel size equals to 7, but we can only load multiples of 4 pixels
|
||||
// we have to set 8th pixel to zero
|
||||
acc = _mm_add_epi16(acc, _mm_andnot_si128(_mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0, 0),
|
||||
_mm_unpackhi_epi8(rgbaIn[1], zero)));
|
||||
acc = _mm_add_epi32(_mm_unpacklo_epi16(acc, zero),
|
||||
_mm_unpackhi_epi16(acc, zero));
|
||||
|
||||
acc = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(acc),
|
||||
_mm_set1_ps(1/((float)(KERNEL_SIZE)))));
|
||||
_mm_set1_ps(1/((float)KERNEL_SIZE))));
|
||||
|
||||
acc = _mm_packs_epi32(acc, zero);
|
||||
acc = _mm_packus_epi16(acc, zero);
|
||||
*(dst + height * column + row) = _mm_cvtsi128_si32(acc);
|
||||
*(dst + height * column + row) =
|
||||
_mm_cvtsi128_si32(_mm_packus_epi16(_mm_packs_epi32(acc, zero), zero));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue