core: factor out code for estimating the rendering time budget

Add a render_statistics type to encapsulate all the statistics done on
rendering times. And use that to estimate the time budget for rendering
and frame pacing.

Tweak the rolling window utilities a bit so we can reuse one rolling
window for multiple statistics.

Signed-off-by: Yuxuan Shui <yshuiv7@gmail.com>
This commit is contained in:
Yuxuan Shui 2023-06-10 14:08:49 +01:00
parent 336cb0917a
commit 8e1f3c92f5
No known key found for this signature in database
GPG Key ID: D3A4405BE6CC17F4
8 changed files with 443 additions and 164 deletions

View File

@ -176,6 +176,20 @@ void paint_all_new(session_t *ps, struct managed_win *t) {
region_t reg_shadow_clip;
pixman_region32_init(&reg_shadow_clip);
struct timespec now;
clock_gettime(CLOCK_MONOTONIC, &now);
auto now_us = (uint64_t)(now.tv_sec * 1000000 + now.tv_nsec / 1000);
if (ps->next_render > 0) {
log_trace("Render schedule deviation: %ld us (%s) %ld %ld",
labs((int64_t)now_us - (int64_t)ps->next_render),
now_us < ps->next_render ? "early" : "late", now_us,
ps->next_render);
ps->last_schedule_delay = 0;
if (now_us > ps->next_render) {
ps->last_schedule_delay = now_us - ps->next_render;
}
}
if (ps->backend_data->ops->prepare) {
ps->backend_data->ops->prepare(ps->backend_data, &reg_paint);
}

View File

@ -58,6 +58,7 @@
#include "list.h"
#include "region.h"
#include "render.h"
#include "statistics.h"
#include "types.h"
#include "utils.h"
#include "win_defs.h"
@ -249,13 +250,16 @@ typedef struct session {
/// When the currently rendered frame will be displayed.
/// 0 means there is no pending frame.
uint64_t target_msc;
/// When did we render our last frame.
uint64_t last_render;
/// The delay between when the last frame was scheduled to be rendered, and when
/// the render actually started.
uint64_t last_schedule_delay;
/// When do we want our next frame to start rendering.
uint64_t next_render;
/// Whether we can perform frame pacing.
bool frame_pacing;
struct rolling_avg *frame_time;
struct rolling_max *render_stats;
/// Render statistics
struct render_statistics render_stats;
// === Operation related ===
/// Flags related to the root window

View File

@ -9,7 +9,7 @@ base_deps = [
srcs = [ files('picom.c', 'win.c', 'c2.c', 'x.c', 'config.c', 'vsync.c', 'utils.c',
'diagnostic.c', 'string_utils.c', 'render.c', 'kernel.c', 'log.c',
'options.c', 'event.c', 'cache.c', 'atom.c', 'file_watch.c') ]
'options.c', 'event.c', 'cache.c', 'atom.c', 'file_watch.c', 'statistics.c') ]
picom_inc = include_directories('.')
cflags = []

View File

@ -16,6 +16,7 @@
#include <errno.h>
#include <fcntl.h>
#include <inttypes.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
@ -43,7 +44,6 @@
#endif
#include "backend/backend.h"
#include "c2.h"
#include "config.h"
#include "diagnostic.h"
#include "log.h"
#include "region.h"
@ -60,6 +60,7 @@
#include "file_watch.h"
#include "list.h"
#include "options.h"
#include "statistics.h"
#include "uthash_extra.h"
/// Get session_t pointer from a pointer to a member of session_t
@ -177,8 +178,17 @@ static inline struct managed_win *find_win_all(session_t *ps, const xcb_window_t
/// 4. otherwise, we schedule a render for that target frame. we use past statistics about
/// how long our renders took to figure out when to start rendering. we start rendering
/// at the latest point of time possible to still hit the target frame.
void schedule_render(session_t *ps) {
///
/// The `triggered_by_vblank` parameter indicates whether this function is
/// triggered by a steady timer, i.e. we are rendering for each vblank. The other case is
/// when we stop rendering for a while because there are no changes on screen, then
/// something changed and schedule_render is triggered by a DamageNotify. The idea is that
/// when the schedule is triggered by a steady timer, schedule_render will be called at a
/// predictable offset into each vblank.
void schedule_render(session_t *ps, bool triggered_by_vblank) {
double delay_s = 0;
ps->next_render = 0;
if (!ps->frame_pacing || !ps->redirected) {
// Not doing frame pacing, schedule a render immediately, if not already
// scheduled.
@ -205,59 +215,77 @@ void schedule_render(session_t *ps) {
// We missed our target, push it back one frame
ps->target_msc = ps->last_msc + 1;
}
log_trace("Still rendering for target frame %ld, not scheduling another "
"render",
ps->target_msc);
return;
}
if (ps->target_msc > ps->last_msc) {
// Render for the target frame is completed, and has yet to be displayed.
// Render for the target frame is completed, but is yet to be displayed.
// Don't schedule another render.
log_trace("Target frame %ld is in the future, and we have already "
"rendered, last msc: %d",
ps->target_msc, (int)ps->last_msc);
return;
}
struct timespec now;
clock_gettime(CLOCK_MONOTONIC, &now);
auto now_us = (uint64_t)now.tv_sec * 1000000 + (uint64_t)now.tv_nsec / 1000;
if (triggered_by_vblank) {
log_trace("vblank schedule delay: %ld us", now_us - ps->last_msc_instant);
}
int render_time_us =
(int)(render_time.tv_sec * 1000000L + render_time.tv_nsec / 1000L);
if (ps->target_msc == ps->last_msc) {
// The frame has just been displayed, record its render time;
log_trace("Last render call took: %d us, rolling_max: %d us",
render_time_us, rolling_max_get_max(ps->render_stats));
rolling_max_push(ps->render_stats, render_time_us);
log_trace("Last render call took: %d (gpu) + %d (cpu) us, "
"last_msc: %" PRIu64,
render_time_us, (int)ps->last_schedule_delay, ps->last_msc);
render_statistics_add_render_time_sample(
&ps->render_stats, render_time_us + (int)ps->last_schedule_delay);
ps->target_msc = 0;
ps->last_schedule_delay = 0;
}
// TODO(yshui): why 1500?
const int SLACK = 1500;
int render_time_estimate = rolling_max_get_max(ps->render_stats);
auto frame_time = (uint64_t)rolling_avg_get_avg(ps->frame_time);
if (render_time_estimate < 0 || frame_time == 0) {
// We don't have render time, and/or frame time estimates, maybe there's
unsigned int divisor = 0;
auto render_budget = render_statistics_get_budget(&ps->render_stats, &divisor);
auto frame_time = render_statistics_get_vblank_time(&ps->render_stats);
if (frame_time == 0) {
// We don't have enough data for render time estimates, maybe there's
// no frame rendered yet, or the backend doesn't support render timing
// information, schedule render immediately.
ps->target_msc = ps->last_msc + 1;
goto schedule;
}
render_time_estimate += SLACK;
auto minimal_target_us = now_us + (uint64_t)render_time_estimate;
auto frame_delay = (uint64_t)ceil(
(double)(minimal_target_us - ps->last_msc_instant) / (double)frame_time);
auto deadline = ps->last_msc_instant + frame_delay * frame_time;
ps->target_msc = ps->last_msc + frame_delay;
auto delay = deadline - (uint64_t)render_time_estimate - now_us;
delay_s = (double)delay / 1000000.0;
if (delay_s > 1) {
log_warn("Delay too long: %f s, render_time: %d us, frame_time: %" PRIu64
" us, now_us: %" PRIu64 " us, next_msc: %" PRIu64 " us",
delay_s, render_time_estimate, frame_time, now_us, deadline);
const auto deadline = ps->last_msc_instant + (unsigned long)divisor * frame_time;
unsigned int available = 0;
if (deadline > now_us) {
available = (unsigned int)(deadline - now_us);
}
log_trace("Delay: %lu us, last_msc: %" PRIu64 ", render_time: %d, frame_time: "
"%" PRIu64 ", now_us: %" PRIu64 ", next_msc: %" PRIu64,
delay, ps->last_msc_instant, render_time_estimate, frame_time, now_us,
deadline);
ps->target_msc = ps->last_msc + divisor;
if (available > render_budget) {
delay_s = (double)(available - render_budget) / 1000000.0;
ps->next_render = deadline - render_budget;
} else {
delay_s = 0;
ps->next_render = now_us;
}
if (delay_s > 1) {
log_warn("Delay too long: %f s, render_budget: %d us, frame_time: "
"%" PRIu32 " us, now_us: %" PRIu64 " us, next_msc: %" PRIu64 " u"
"s",
delay_s, render_budget, frame_time, now_us, deadline);
}
log_trace("Delay: %.6lf s, last_msc: %" PRIu64 ", render_budget: %d, frame_time: "
"%" PRIu32 ", now_us: %" PRIu64 ", next_msc: %" PRIu64 ", "
"target_msc: %" PRIu64 ", divisor: %d",
delay_s, ps->last_msc_instant, render_budget, frame_time, now_us,
deadline, ps->target_msc, divisor);
schedule:
assert(!ev_is_active(&ps->draw_timer));
@ -282,7 +310,7 @@ void queue_redraw(session_t *ps) {
// If frame pacing is not enabled, pretend this is false.
// If --benchmark is used, redraw is always queued
if (!ps->redraw_needed && !ps->o.benchmark) {
schedule_render(ps);
schedule_render(ps, false);
}
ps->redraw_needed = true;
}
@ -758,7 +786,6 @@ static void configure_root(session_t *ps) {
}
force_repaint(ps);
}
return;
}
static void handle_root_flags(session_t *ps) {
@ -1439,10 +1466,9 @@ static bool redirect_start(session_t *ps) {
// states.
ps->last_msc_instant = 0;
ps->last_msc = 0;
ps->last_render = 0;
ps->last_schedule_delay = 0;
ps->target_msc = 0;
rolling_avg_reset(ps->frame_time);
rolling_max_reset(ps->render_stats);
render_statistics_reset(&ps->render_stats);
} else if (ps->frame_pacing) {
log_error("Present extension is not supported, frame pacing disabled.");
ps->frame_pacing = false;
@ -1543,11 +1569,12 @@ handle_present_complete_notify(session_t *ps, xcb_present_complete_notify_event_
if (ps->last_msc_instant != 0) {
auto frame_count = cne->msc - ps->last_msc;
int frame_time = (int)((cne->ust - ps->last_msc_instant) / frame_count);
rolling_avg_push(ps->frame_time, frame_time);
log_trace("Frame count %lu, frame time: %d us, rolling average: %lf us, "
render_statistics_add_vblank_time_sample(&ps->render_stats, frame_time);
log_trace("Frame count %lu, frame time: %d us, rolling average: %u us, "
"msc: %" PRIu64 ", offset: %d us",
frame_count, frame_time, rolling_avg_get_avg(ps->frame_time),
cne->ust, (int)drift);
frame_count, frame_time,
render_statistics_get_vblank_time(&ps->render_stats), cne->ust,
(int)drift);
} else if (drift > 1000000 && ps->frame_pacing) {
// This is the first MSC event we receive, let's check if the timestamps
// align with the monotonic clock. If not, disable frame pacing because we
@ -1560,7 +1587,7 @@ handle_present_complete_notify(session_t *ps, xcb_present_complete_notify_event_
ps->last_msc_instant = cne->ust;
ps->last_msc = cne->msc;
if (ps->redraw_needed) {
schedule_render(ps);
schedule_render(ps, true);
}
}
@ -1651,6 +1678,8 @@ static void tmout_unredir_callback(EV_P attr_unused, ev_timer *w, int revents at
}
static void fade_timer_callback(EV_P attr_unused, ev_timer *w, int revents attr_unused) {
// TODO(yshui): do we still need the fade timer? we queue redraw automatically in
// draw_callback_impl if animation is running.
session_t *ps = session_ptr(w, fade_timer);
queue_redraw(ps);
}
@ -1778,12 +1807,6 @@ static void draw_callback_impl(EV_P_ session_t *ps, int revents attr_unused) {
}
log_trace("Render end");
struct timespec now;
clock_gettime(CLOCK_MONOTONIC, &now);
uint64_t now_us =
(uint64_t)now.tv_sec * 1000000ULL + (uint64_t)now.tv_nsec / 1000;
ps->last_render = now_us;
ps->first_frame = false;
paint++;
if (ps->o.benchmark && paint >= ps->o.benchmark) {
@ -2004,8 +2027,7 @@ static session_t *session_init(int argc, char **argv, Display *dpy,
pixman_region32_init(&ps->screen_reg);
// TODO(yshui) investigate what's the best window size
ps->render_stats = rolling_max_new(128);
ps->frame_time = rolling_avg_new(16);
render_statistics_init(&ps->render_stats, 128);
ps->pending_reply_tail = &ps->pending_reply_head;
@ -2642,8 +2664,7 @@ static void session_destroy(session_t *ps) {
free(ps->o.glx_fshader_win_str);
x_free_randr_info(ps);
rolling_avg_destroy(ps->frame_time);
rolling_max_destroy(ps->render_stats);
render_statistics_destroy(&ps->render_stats);
// Release custom window shaders
free(ps->o.window_shader_fg);

100
src/statistics.c Normal file
View File

@ -0,0 +1,100 @@
//! Rendering statistics
//!
//! Tracks how long it takes to render a frame, for measuring performance, and for pacing
//! the frames.
#include "statistics.h"
#include "log.h"
#include "utils.h"
/// Initialize a render_statistics object tracking a window of `window_size`
/// render time samples.
void render_statistics_init(struct render_statistics *rs, int window_size) {
	*rs = (struct render_statistics){0};
	rolling_window_init(&rs->render_times, window_size);
	// Track the 98th percentile of render times, allowing the tracked rank to
	// drift by 1% of the window before a re-estimate is forced.
	const double quantile = 0.98;
	const double tolerance = 0.01;
	rolling_quantile_init_with_tolerance(&rs->render_time_quantile, window_size,
	                                     quantile, tolerance);
}
/// Record one vblank interval sample (in microseconds).
///
/// Maintains a running mean/variance of the vblank time, with two safeguards:
///   1. a sample further than 3 standard deviations from the current estimate
///      is treated as an outlier (e.g. a refresh rate change) and resets the
///      statistics;
///   2. once roughly 10 seconds worth of samples (and more than 20) have been
///      collected, the estimate is considered stable and further samples are
///      ignored (until an outlier triggers a reset).
void render_statistics_add_vblank_time_sample(struct render_statistics *rs, int time_us) {
	// Standard deviation of the samples so far; 0 until we have >= 2 samples.
	auto sample_sd = sqrt(cumulative_mean_and_var_get_var(&rs->vblank_time_us));
	// 0 during warm-up (see render_statistics_get_vblank_time), which
	// disables the outlier check below until the estimate is meaningful.
	auto current_estimate = render_statistics_get_vblank_time(rs);
	if (current_estimate != 0 && fabs((double)time_us - current_estimate) > sample_sd * 3) {
		// Deviated from the mean by more than 3 sigma (p < 0.003)
		log_debug("vblank time outlier: %d %f %f", time_us, rs->vblank_time_us.mean,
		          cumulative_mean_and_var_get_var(&rs->vblank_time_us));
		// An outlier sample, this could mean things like refresh rate changes, so
		// we reset the statistics. This could also be benign, but we like to be
		// cautious.
		cumulative_mean_and_var_init(&rs->vblank_time_us);
	}
	if (rs->vblank_time_us.mean != 0) {
		// How many vblanks fit in 10 seconds at the current mean interval.
		auto nframes_in_10_seconds =
		    (unsigned int)(10. * 1000000. / rs->vblank_time_us.mean);
		if (rs->vblank_time_us.n > 20 && rs->vblank_time_us.n > nframes_in_10_seconds) {
			// We collected 10 seconds worth of samples, we assume the
			// estimated refresh rate is stable. We will still reset the
			// statistics if we get an outlier sample though, see above.
			return;
		}
	}
	cumulative_mean_and_var_update(&rs->vblank_time_us, time_us);
}
/// Record how long one render took (in microseconds).
void render_statistics_add_render_time_sample(struct render_statistics *rs, int time_us) {
	int evicted = 0;
	bool was_full = rolling_window_push_back(&rs->render_times, time_us, &evicted);
	if (was_full) {
		// The window dropped its oldest sample to make room; remove it
		// from the quantile tracker too so the two stay in sync.
		rolling_quantile_pop_front(&rs->render_time_quantile, evicted);
	}
	rolling_quantile_push_back(&rs->render_time_quantile, time_us);
}
/// How much time budget we should give to the backend for rendering, in microseconds.
///
/// A `divisor` is also returned, indicating the target framerate. The divisor is
/// the number of vblanks we should wait between each frame. A divisor of 1 means
/// full framerate, 2 means half framerate, etc.
unsigned int
render_statistics_get_budget(struct render_statistics *rs, unsigned int *divisor) {
	const bool window_full = rs->render_times.nelem >= rs->render_times.window_size;
	if (!window_full) {
		// Not enough render time samples yet for a percentile estimate;
		// assume an unlimited budget at full framerate.
		*divisor = 1;
		return UINT_MAX;
	}

	// N-th percentile of render times, see render_statistics_init for N.
	auto percentile =
	    rolling_quantile_estimate(&rs->render_time_quantile, &rs->render_times);
	// With no usable vblank estimate, optimistically target every vblank;
	// otherwise target one more vblank than the render is expected to span.
	*divisor = render_statistics_get_vblank_time(rs) == 0
	               ? 1
	               : (unsigned int)(percentile / rs->vblank_time_us.mean + 1);
	return (unsigned int)percentile;
}
/// Return the estimated time between vblanks, in microseconds; 0 means "no
/// reliable estimate yet" and callers fall back to scheduling immediately.
unsigned int render_statistics_get_vblank_time(struct render_statistics *rs) {
	if (rs->vblank_time_us.n <= 20 || rs->vblank_time_us.mean < 100) {
		// Not enough samples yet, or the mean is implausibly short for a
		// vblank interval (< 100 us would be > 10000 Hz); report that we
		// have no estimate.
		return 0;
	}
	return (unsigned int)rs->vblank_time_us.mean;
}
/// Discard all collected statistics, returning to a freshly-initialized state.
/// Allocations owned by the sub-objects are kept.
void render_statistics_reset(struct render_statistics *rs) {
	cumulative_mean_and_var_init(&rs->vblank_time_us);
	rolling_quantile_reset(&rs->render_time_quantile);
	rolling_window_reset(&rs->render_times);
}
/// Free all resources owned by the statistics object. It must be initialized
/// again with render_statistics_init() before reuse.
void render_statistics_destroy(struct render_statistics *rs) {
	render_statistics_reset(rs);
	rolling_quantile_destroy(&rs->render_time_quantile);
	rolling_window_destroy(&rs->render_times);
}

33
src/statistics.h Normal file
View File

@ -0,0 +1,33 @@
#pragma once
#include "utils.h"
#define NTIERS (3)
/// All statistics collected about rendering, used to estimate the render time
/// budget and to pace frames.
struct render_statistics {
	/// Rolling window of render times, in microseconds.
	// NOTE(review): an earlier comment mentioned per-sample "tiers" (see
	// NTIERS above), but nothing visible here stores them — confirm.
	struct rolling_window render_times;
	/// Estimates the 98th percentile of render times (the exact quantile and
	/// tolerance are set in render_statistics_init).
	struct rolling_quantile render_time_quantile;
	/// Running mean/variance of the time between vblanks, in microseconds.
	struct cumulative_mean_and_var vblank_time_us;
};
void render_statistics_init(struct render_statistics *rs, int window_size);
void render_statistics_reset(struct render_statistics *rs);
void render_statistics_destroy(struct render_statistics *rs);
void render_statistics_add_vblank_time_sample(struct render_statistics *rs, int time_us);
void render_statistics_add_render_time_sample(struct render_statistics *rs, int time_us);
/// How much time budget we should give to the backend for rendering, in microseconds.
///
/// A `divisor` is also returned, indicating the target framerate. The divisor is
/// the number of vblanks we should wait between each frame. A divisor of 1 means
/// full framerate, 2 means half framerate, etc.
unsigned int
render_statistics_get_budget(struct render_statistics *rs, unsigned int *divisor);
unsigned int render_statistics_get_vblank_time(struct render_statistics *rs);

View File

@ -48,7 +48,43 @@ int next_power_of_two(int n) {
return n;
}
/// Track the rolling maximum of a stream of integers.
/// Release the backing buffer of a rolling window. The element pointer is
/// cleared after being freed, so a second call is harmless.
void rolling_window_destroy(struct rolling_window *rw) {
	int *buffer = rw->elem;
	rw->elem = NULL;
	free(buffer);
}
/// Empty the window without touching its allocation.
void rolling_window_reset(struct rolling_window *rw) {
	rw->elem_head = 0;
	rw->nelem = 0;
}
/// Set up an empty rolling window capable of holding `size` integers.
void rolling_window_init(struct rolling_window *rw, int size) {
	rw->window_size = size;
	rw->elem = ccalloc(size, int);
	rolling_window_reset(rw);
}
/// Remove and return the oldest element. The window must be non-empty.
int rolling_window_pop_front(struct rolling_window *rw) {
	assert(rw->nelem > 0);
	int front = rw->elem[rw->elem_head];
	rw->nelem -= 1;
	rw->elem_head = (rw->elem_head + 1) % rw->window_size;
	return front;
}
/// Append `val` to the window. If the window was already full, the oldest
/// element is evicted into `*front` and true is returned; otherwise `*front`
/// is left untouched and false is returned.
bool rolling_window_push_back(struct rolling_window *rw, int val, int *front) {
	const bool evicting = (rw->nelem == rw->window_size);
	if (evicting) {
		*front = rolling_window_pop_front(rw);
	}
	auto slot = (rw->elem_head + rw->nelem) % rw->window_size;
	rw->elem[slot] = val;
	rw->nelem++;
	return evicting;
}
/// Track the maximum member of a FIFO queue of integers. Integers are pushed to the back
/// and popped from the front, the maximum of the current members in the queue is
/// tracked.
struct rolling_max {
/// A priority queue holding the indices of the maximum element candidates.
/// The head of the queue is the index of the maximum element.
@ -59,32 +95,26 @@ struct rolling_max {
/// it's called the "original" indices.
int *p;
int p_head, np;
/// The elemets
int *elem;
int elem_head, nelem;
int window_size;
/// The maximum number of in flight elements.
int capacity;
};
void rolling_max_destroy(struct rolling_max *rm) {
free(rm->elem);
free(rm->p);
free(rm);
}
struct rolling_max *rolling_max_new(int size) {
struct rolling_max *rolling_max_new(int capacity) {
auto rm = ccalloc(1, struct rolling_max);
if (!rm) {
return NULL;
}
rm->p = ccalloc(size, int);
rm->elem = ccalloc(size, int);
rm->window_size = size;
if (!rm->p || !rm->elem) {
rm->p = ccalloc(capacity, int);
if (!rm->p) {
goto err;
}
rm->capacity = capacity;
return rm;
@ -96,33 +126,21 @@ err:
void rolling_max_reset(struct rolling_max *rm) {
rm->p_head = 0;
rm->np = 0;
rm->nelem = 0;
rm->elem_head = 0;
}
void rolling_max_push(struct rolling_max *rm, int val) {
#define IDX(n) ((n) % rm->window_size)
if (rm->nelem == rm->window_size) {
auto old_head = rm->elem_head;
// Discard the oldest element.
// rm->elem.pop_front();
rm->nelem--;
rm->elem_head = IDX(rm->elem_head + 1);
// Remove discarded element from the priority queue too.
assert(rm->np);
if (rm->p[rm->p_head] == old_head) {
// rm->p.pop_front()
rm->p_head = IDX(rm->p_head + 1);
rm->np--;
}
#define IDX(n) ((n) % rm->capacity)
/// Remove the oldest element in the window. The caller must maintain the list of elements
/// themselves, i.e. the behavior is undefined if `front` does not match the oldest
/// element.
void rolling_max_pop_front(struct rolling_max *rm, int front) {
if (rm->p[rm->p_head] == front) {
// rm->p.pop_front()
rm->p_head = IDX(rm->p_head + 1);
rm->np--;
}
}
// Add the new element to the queue.
// rm->elem.push_back(val)
rm->elem[IDX(rm->elem_head + rm->nelem)] = val;
rm->nelem++;
void rolling_max_push_back(struct rolling_max *rm, int val) {
// Update the prority queue.
// Remove all elements smaller than the new element from the queue. Because
// the new element will become the maximum element before them, and since they
@ -130,7 +148,7 @@ void rolling_max_push(struct rolling_max *rm, int val) {
// element, so they will never become the maximum element.
while (rm->np) {
int p_tail = IDX(rm->p_head + rm->np - 1);
if (rm->elem[rm->p[p_tail]] > val) {
if (rm->p[p_tail] > val) {
break;
}
// rm->p.pop_back()
@ -138,108 +156,119 @@ void rolling_max_push(struct rolling_max *rm, int val) {
}
// Add the new element to the end of the queue.
// rm->p.push_back(rm->start_index + rm->nelem - 1)
rm->p[IDX(rm->p_head + rm->np)] = IDX(rm->elem_head + rm->nelem - 1);
assert(rm->np < rm->capacity);
rm->p[IDX(rm->p_head + rm->np)] = val;
rm->np++;
#undef IDX
}
#undef IDX
int rolling_max_get_max(struct rolling_max *rm) {
if (rm->np == 0) {
return INT_MIN;
}
return rm->elem[rm->p[rm->p_head]];
return rm->p[rm->p_head];
}
TEST_CASE(rolling_max_test) {
#define NELEM 15
struct rolling_window queue;
rolling_window_init(&queue, 3);
auto rm = rolling_max_new(3);
const int data[NELEM] = {1, 2, 3, 1, 4, 5, 2, 3, 6, 5, 4, 3, 2, 0, 0};
const int expected_max[NELEM] = {1, 2, 3, 3, 4, 5, 5, 5, 6, 6, 6, 5, 4, 3, 2};
int max[NELEM] = {0};
for (int i = 0; i < NELEM; i++) {
rolling_max_push(rm, data[i]);
int front;
bool full = rolling_window_push_back(&queue, data[i], &front);
if (full) {
rolling_max_pop_front(rm, front);
}
rolling_max_push_back(rm, data[i]);
max[i] = rolling_max_get_max(rm);
}
rolling_window_destroy(&queue);
rolling_max_destroy(rm);
TEST_TRUE(memcmp(max, expected_max, sizeof(max)) == 0);
#undef NELEM
}
/// A rolling average of a stream of integers.
struct rolling_avg {
/// The sum of the elements in the window.
int64_t sum;
// Find the k-th smallest element in an array.
int quickselect(int *elems, int nelem, int k) {
int l = 0, r = nelem; // [l, r) is the range of candidates
while (l != r) {
int pivot = elems[l];
int i = l, j = r;
while (i < j) {
while (i < j && elems[--j] >= pivot) {
}
elems[i] = elems[j];
while (i < j && elems[++i] <= pivot) {
}
elems[j] = elems[i];
}
elems[i] = pivot;
/// The elements in the window.
int *elem;
int head, nelem;
if (i == k) {
break;
}
int window_size;
};
struct rolling_avg *rolling_avg_new(int size) {
auto rm = ccalloc(1, struct rolling_avg);
if (!rm) {
return NULL;
if (i < k) {
l = i + 1;
} else {
r = i;
}
}
rm->elem = ccalloc(size, int);
rm->window_size = size;
if (!rm->elem) {
free(rm);
return NULL;
}
return rm;
return elems[k];
}
void rolling_avg_destroy(struct rolling_avg *rm) {
free(rm->elem);
free(rm);
void rolling_quantile_init(struct rolling_quantile *rq, int capacity, int mink, int maxk) {
*rq = (struct rolling_quantile){0};
rq->tmp_buffer = malloc(sizeof(int) * (size_t)capacity);
rq->capacity = capacity;
rq->min_target_rank = mink;
rq->max_target_rank = maxk;
}
void rolling_avg_reset(struct rolling_avg *ra) {
ra->sum = 0;
ra->nelem = 0;
ra->head = 0;
/// Initialize a rolling quantile estimator for quantile `target` (in [0, 1]),
/// accepting any rank within `tolerance` of it.
void rolling_quantile_init_with_tolerance(struct rolling_quantile *rq, int window_size,
                                          double target, double tolerance) {
	auto mink = (int)((target - tolerance) * window_size);
	auto maxk = (int)((target + tolerance) * window_size);
	rolling_quantile_init(rq, window_size, mink, maxk);
}
void rolling_avg_push(struct rolling_avg *ra, int val) {
if (ra->nelem == ra->window_size) {
// Discard the oldest element.
// rm->elem.pop_front();
ra->sum -= ra->elem[ra->head % ra->window_size];
ra->nelem--;
ra->head = (ra->head + 1) % ra->window_size;
/// Forget the current estimate; the next call to rolling_quantile_estimate
/// will recompute it from scratch.
void rolling_quantile_reset(struct rolling_quantile *rq) {
	rq->estimate = 0;
	rq->current_rank = 0;
}
/// Free the scratch buffer used for quantile re-estimation. The pointer is
/// cleared after being freed, matching rolling_window_destroy, so a repeated
/// destroy cannot double-free.
void rolling_quantile_destroy(struct rolling_quantile *rq) {
	free(rq->tmp_buffer);
	rq->tmp_buffer = NULL;
}
int rolling_quantile_estimate(struct rolling_quantile *rq, struct rolling_window *elements) {
if (rq->current_rank < rq->min_target_rank || rq->current_rank > rq->max_target_rank) {
if (elements->nelem != elements->window_size) {
return INT_MIN;
}
// Re-estimate the quantile.
assert(elements->nelem <= rq->capacity);
rolling_window_copy_to_array(elements, rq->tmp_buffer);
const int target_rank =
rq->min_target_rank + (rq->max_target_rank - rq->min_target_rank) / 2;
rq->estimate = quickselect(rq->tmp_buffer, elements->nelem, target_rank);
rq->current_rank = target_rank;
}
// Add the new element to the queue.
// rm->elem.push_back(val)
ra->elem[(ra->head + ra->nelem) % ra->window_size] = val;
ra->sum += val;
ra->nelem++;
return rq->estimate;
}
double rolling_avg_get_avg(struct rolling_avg *ra) {
if (ra->nelem == 0) {
return 0;
void rolling_quantile_push_back(struct rolling_quantile *rq, int x) {
if (x <= rq->estimate) {
rq->current_rank++;
}
return (double)ra->sum / (double)ra->nelem;
}
TEST_CASE(rolling_avg_test) {
#define NELEM 15
auto rm = rolling_avg_new(3);
const int data[NELEM] = {1, 2, 3, 1, 4, 5, 2, 3, 6, 5, 4, 3, 2, 0, 0};
const double expected_avg[NELEM] = {
1, 1.5, 2, 2, 8.0 / 3.0, 10.0 / 3.0, 11.0 / 3.0, 10.0 / 3.0,
11.0 / 3.0, 14.0 / 3.0, 5, 4, 3, 5.0 / 3.0, 2.0 / 3.0};
double avg[NELEM] = {0};
for (int i = 0; i < NELEM; i++) {
rolling_avg_push(rm, data[i]);
avg[i] = rolling_avg_get_avg(rm);
}
for (int i = 0; i < NELEM; i++) {
TEST_EQUAL(avg[i], expected_avg[i]);
/// Account for the removal of element `x` (the oldest in the window): if it
/// ranked at or below the current estimate, the estimate's rank shifts down.
void rolling_quantile_pop_front(struct rolling_quantile *rq, int x) {
	const bool below_estimate = (x <= rq->estimate);
	if (below_estimate) {
		rq->current_rank -= 1;
	}
}

View File

@ -17,6 +17,7 @@
#include <time.h>
#include "compiler.h"
#include "log.h"
#include "types.h"
#define ARR_SIZE(arr) (sizeof(arr) / sizeof(arr[0]))
@ -289,20 +290,97 @@ static inline void free_charpp(char **str) {
///
int next_power_of_two(int n);
/// A fixed-capacity FIFO of ints stored in a circular buffer. Elements are
/// pushed at the back and popped from the front.
struct rolling_window {
	/// Backing storage; holds `window_size` ints, owned by this struct.
	int *elem;
	/// Index of the oldest element, and the current element count.
	int elem_head, nelem;
	/// Capacity of `elem`.
	int window_size;
};
void rolling_window_destroy(struct rolling_window *rw);
void rolling_window_reset(struct rolling_window *rw);
void rolling_window_init(struct rolling_window *rw, int size);
int rolling_window_pop_front(struct rolling_window *rw);
bool rolling_window_push_back(struct rolling_window *rw, int val, int *front);
/// Copy the elements of the rolling window, oldest first, into `arr`. The
/// caller must provide at least `rw->nelem` ints of space.
static inline void attr_unused rolling_window_copy_to_array(struct rolling_window *rw,
                                                            int *arr) {
	auto total = (size_t)rw->nelem;
	// Number of live elements stored contiguously from the head up to the
	// end of the backing buffer.
	auto contiguous = (size_t)(rw->window_size - rw->elem_head);
	if (contiguous >= total) {
		// No wrap-around; a single copy suffices.
		memcpy(arr, rw->elem + rw->elem_head, sizeof(int) * total);
	} else {
		// Wrapped: copy head..end, then the part that wrapped to the start.
		memcpy(arr, rw->elem + rw->elem_head, sizeof(int) * contiguous);
		memcpy(arr + contiguous, rw->elem, sizeof(int) * (total - contiguous));
	}
}
struct rolling_max;
struct rolling_max *rolling_max_new(int window_size);
struct rolling_max *rolling_max_new(int capacity);
void rolling_max_destroy(struct rolling_max *rm);
void rolling_max_reset(struct rolling_max *rm);
void rolling_max_push(struct rolling_max *rm, int val);
void rolling_max_pop_front(struct rolling_max *rm, int front);
void rolling_max_push_back(struct rolling_max *rm, int val);
int rolling_max_get_max(struct rolling_max *rm);
struct rolling_avg;
struct rolling_avg *rolling_avg_new(int window_size);
void rolling_avg_destroy(struct rolling_avg *ra);
void rolling_avg_reset(struct rolling_avg *ra);
void rolling_avg_push(struct rolling_avg *ra, int val);
double rolling_avg_get_avg(struct rolling_avg *ra);
/// Estimate the mean and variance of random variable X using Welford's online
/// algorithm.
struct cumulative_mean_and_var {
	/// Running mean of the samples seen so far.
	double mean;
	/// Sum of squared deviations from the running mean (Welford's M2).
	double m2;
	/// Number of samples accumulated.
	unsigned int n;
};
/// Reset the accumulator to its empty state (no samples).
static inline attr_unused void
cumulative_mean_and_var_init(struct cumulative_mean_and_var *cmv) {
	cmv->mean = 0.0;
	cmv->m2 = 0.0;
	cmv->n = 0;
}
/// Fold one sample `x` into the running mean/variance (Welford's single-pass
/// update). The statement order is significant: `delta` uses the old mean,
/// while the m2 update uses the freshly updated mean.
static inline attr_unused void
cumulative_mean_and_var_update(struct cumulative_mean_and_var *cmv, double x) {
	if (cmv->n == UINT_MAX) {
		// The sample counter is saturated; stop updating rather than let
		// it wrap and corrupt the statistics.
		return;
	}
	cmv->n++;
	double delta = x - cmv->mean;
	cmv->mean += delta / (double)cmv->n;
	cmv->m2 += delta * (x - cmv->mean);
}
/// Sample variance of the accumulated samples (Bessel-corrected, m2 / (n-1)).
/// Returns 0 when fewer than two samples have been collected.
static inline attr_unused double
cumulative_mean_and_var_get_var(struct cumulative_mean_and_var *cmv) {
	return cmv->n < 2 ? 0.0 : cmv->m2 / (double)(cmv->n - 1);
}
// Find the k-th smallest element in an array.
int quickselect(int *elems, int nelem, int k);
/// A naive quantile estimator.
///
/// Estimates a quantile of a random variable X in a sliding window. The
/// estimate is only recomputed (quickselect over a copy of the window) when
/// the tracked rank drifts outside [min_target_rank, max_target_rank].
struct rolling_quantile {
	/// Rank of `estimate` within the current window, maintained incrementally
	/// by rolling_quantile_push_back/pop_front.
	int current_rank;
	/// Acceptable range for `current_rank`; outside it, the estimate is redone.
	int min_target_rank, max_target_rank;
	/// The current quantile estimate (a past element of the window).
	int estimate;
	/// Maximum number of elements; also the size of `tmp_buffer`.
	int capacity;
	/// Scratch space for quickselect, `capacity` ints.
	int *tmp_buffer;
};
void rolling_quantile_init(struct rolling_quantile *rq, int capacity, int mink, int maxk);
void rolling_quantile_init_with_tolerance(struct rolling_quantile *rq, int window_size,
double target, double tolerance);
void rolling_quantile_reset(struct rolling_quantile *rq);
void rolling_quantile_destroy(struct rolling_quantile *rq);
int rolling_quantile_estimate(struct rolling_quantile *rq, struct rolling_window *elements);
void rolling_quantile_push_back(struct rolling_quantile *rq, int x);
void rolling_quantile_pop_front(struct rolling_quantile *rq, int x);
// Some versions of the Android libc do not have timespec_get(), use
// clock_gettime() instead.