backend: gl_common: Use linear interpolation on GPU for blur kernels.

Make use of the GPU's hardware linear interpolation to sample 2 pixels with a single texture access inside the blur shaders, by sampling at a position between both pixels chosen according to their relative weights. This is significantly easier in a single dimension, as 2D bilinear filtering would impose additional constraints on the kernels (no single zero-entries, no zero-diagonals, ...) which would require additional checks for limited improvement. Therefore, using interpolation only along the larger dimension should be a sufficient improvement. Doing so effectively halves the number of texture accesses and additions needed for a kernel. E.g. a 1D pass of the Gaussian blur with radius 15 will only need 16 samples instead of 31.
2020-06-07 12:41:32 +02:00 · 2020-06-07 12:41:32 +02:00 · 88b1638487
parent 4b0ff37b36
commit 88b1638487
1 changed files with 55 additions and 12 deletions
--- a/src/backend/gl/gl_common.c
+++ b/src/backend/gl/gl_common.c
@ -996,7 +996,7 @@ void *gl_create_blur_context(backend_t *base, enum blur_method method, void *arg
 		}
 	);
 	static const char *FRAG_SHADER_BLUR_ADD = QUOTE(
-		sum += float(%.7g) * texture2D(tex_src, uv + pixel_norm * vec2(%d, %d));
+		sum += float(%.7g) * texture2D(tex_src, uv + pixel_norm * vec2(%.7g, %.7g));
 	);
 	// clang-format on

@ -1008,25 +1008,68 @@ void *gl_create_blur_context(backend_t *base, enum blur_method method, void *arg
 		// Build shader
 		int width = kern->w, height = kern->h;
 		int nele = width * height;
+		// '%.7g' is at most 14 characters, inserted 3 times
 		size_t body_len = (strlen(shader_add) + 42) * (uint)nele;
 		char *shader_body = ccalloc(body_len, char);
 		char *pc = shader_body;

+		// Make use of the linear interpolation hardware by sampling 2 pixels with
+		// one texture access by sampling between both pixels based on their
+		// relative weight. Easiest done in a single dimension as 2D bilinear
+		// filtering would raise additional constraints on the kernels. Therefore
+		// only use interpolation along the larger dimension.
 		double sum = 0.0;
+		if (width > height) {
+			// use interpolation in x dimension (width)
 			for (int j = 0; j < height; ++j) {
-			for (int k = 0; k < width; ++k) {
-				double val;
-				val = kern->data[j * width + k];
-				if (val == 0) {
+				for (int k = 0; k < width; k += 2) {
+					double val1, val2;
+					val1 = kern->data[j * width + k];
+					val2 = (k + 1 < width)
+					           ? kern->data[j * width + k + 1]
+					           : 0;
+
+					double combined_weight = val1 + val2;
+					if (combined_weight == 0) {
 						continue;
 					}
-				sum += val;
-				pc += snprintf(pc, body_len - (ulong)(pc - shader_body),
-				               FRAG_SHADER_BLUR_ADD, val, k - width / 2,
-				               j - height / 2);
+					sum += combined_weight;
+
+					double offset_x =
+					    k + (val2 / combined_weight) - (width / 2);
+					double offset_y = j - (height / 2);
+					pc += snprintf(
+					    pc, body_len - (ulong)(pc - shader_body),
+					    shader_add, combined_weight, offset_x, offset_y);
 					assert(pc < shader_body + body_len);
 				}
 			}
+		} else {
+			// use interpolation in y dimension (height)
+			for (int j = 0; j < height; j += 2) {
+				for (int k = 0; k < width; ++k) {
+					double val1, val2;
+					val1 = kern->data[j * width + k];
+					val2 = (j + 1 < height)
+					           ? kern->data[(j + 1) * width + k]
+					           : 0;
+
+					double combined_weight = val1 + val2;
+					if (combined_weight == 0) {
+						continue;
+					}
+					sum += combined_weight;
+
+					double offset_x = k - (width / 2);
+					double offset_y =
+					    j + (val2 / combined_weight) - (height / 2);
+					pc += snprintf(
+					    pc, body_len - (ulong)(pc - shader_body),
+					    shader_add, combined_weight, offset_x, offset_y);
+					assert(pc < shader_body + body_len);
+				}
+			}
+		}

 		auto pass = ctx->blur_shader + i;
 		size_t shader_len = strlen(FRAG_SHADER_BLUR) + strlen(extension) +