diff --git a/docs/architecture.md b/docs/architecture.md
index c1f70262..958c6a25 100644
--- a/docs/architecture.md
+++ b/docs/architecture.md
@@ -20,6 +20,7 @@ Clustered mode is shown/discussed here. Single mode is analogous to having a sin
 * By default, a single, separate thread is used to receive HTTP requests across the socket.
 * When at least one worker thread is available for work, a connection is accepted and placed in this request buffer
 * This thread waits for entire HTTP requests to be received over the connection
+  * The time spent waiting for the HTTP request body to be received is exposed to the Rack app as `env['puma.request_body_wait']` (milliseconds)
 * Once received, the connection is pushed into the "todo" set
 * Worker threads pop work off the "todo" set for processing
 * The thread processes the request via the rack application (which generates the HTTP response)
diff --git a/docs/deployment.md b/docs/deployment.md
index 39ac26bb..d44198b7 100644
--- a/docs/deployment.md
+++ b/docs/deployment.md
@@ -54,6 +54,22 @@ up resources.
 Watch your CPU utilization over time and aim for about 70% on average. This means
 you've got capacity still but aren't starving threads.
 
+**Measuring utilization**
+
+Using a timestamp header from an upstream proxy server (e.g. nginx or haproxy), it's
+possible to get an indication of how long requests have been waiting for a Puma
+thread to become available.
+
+* Have your upstream proxy set a header with the time it received the request:
+  * nginx: `proxy_set_header X-Request-Start "${msec}";`
+  * haproxy: `http-request set-header X-Request-Start "%t";`
+* In your Rack middleware, determine the amount of time elapsed since `X-Request-Start` (an example middleware is sketched below).
+* To improve accuracy, you will want to subtract time spent waiting for slow clients:
+  * `env['puma.request_body_wait']` contains the number of milliseconds Puma spent
+    waiting for the client to send the request body.
+  * haproxy: `%Th` (TLS handshake time) and `%Ti` (idle time before request) can
+    also be added as headers.
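+
+For example, a Rack middleware along these lines could log the queue time. This is
+an illustrative sketch rather than something Puma ships, and it assumes the nginx
+`${msec}` format shown above (Unix time in seconds with millisecond resolution):
+
+```ruby
+require 'logger'
+
+# Hypothetical middleware: reports how long a request waited before reaching the app.
+class QueueTimeLogger
+  def initialize(app, logger: Logger.new($stdout))
+    @app = app
+    @logger = logger
+  end
+
+  def call(env)
+    if (start = env['HTTP_X_REQUEST_START'])
+      elapsed_ms = (Time.now.to_f - start.to_f) * 1000
+      # Subtract the time Puma spent waiting on a slow client to send the request body.
+      elapsed_ms -= env['puma.request_body_wait'].to_f
+      @logger.info "queue_time_ms=#{elapsed_ms.round(1)}"
+    end
+    @app.call(env)
+  end
+end
+```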
+
 ## Daemonizing
 
 I prefer to not daemonize my servers and use something like `runit` or `upstart` to
diff --git a/lib/puma/client.rb b/lib/puma/client.rb
index bcf86f88..9df5a4ae 100644
--- a/lib/puma/client.rb
+++ b/lib/puma/client.rb
@@ -175,8 +175,7 @@ module Puma
         rest = io.read
         rest = rest[2..-1] if rest.start_with?("\r\n")
         @buffer = rest.empty? ? nil : rest
-        @requests_served += 1
-        @ready = true
+        set_ready
         return true
       end
 
@@ -224,8 +223,7 @@ module Puma
       unless chunk
         @body.close
         @buffer = nil
-        @requests_served += 1
-        @ready = true
+        set_ready
         raise EOFError
       end
 
@@ -234,6 +232,8 @@ module Puma
     end
 
     def setup_body
+      @body_read_start = Process.clock_gettime(Process::CLOCK_MONOTONIC, :millisecond)
+
       if @env[HTTP_EXPECT] == CONTINUE
         # TODO allow a hook here to check the headers before
         # going forward
@@ -258,8 +258,7 @@ module Puma
       unless cl
         @buffer = body.empty? ? nil : body
         @body = EmptyBody
-        @requests_served += 1
-        @ready = true
+        set_ready
         return true
       end
 
@@ -268,8 +267,7 @@ module Puma
       if remain <= 0
         @body = StringIO.new(body)
         @buffer = nil
-        @requests_served += 1
-        @ready = true
+        set_ready
         return true
       end
 
@@ -304,8 +302,7 @@ module Puma
       # No data means a closed socket
       unless data
         @buffer = nil
-        @requests_served += 1
-        @ready = true
+        set_ready
         raise EOFError
       end
 
@@ -341,8 +338,7 @@ module Puma
       # No data means a closed socket
       unless data
         @buffer = nil
-        @requests_served += 1
-        @ready = true
+        set_ready
         raise EOFError
       end
 
@@ -419,8 +415,7 @@ module Puma
       unless chunk
         @body.close
         @buffer = nil
-        @requests_served += 1
-        @ready = true
+        set_ready
         raise EOFError
       end
 
@@ -429,8 +424,7 @@ module Puma
       if remain <= 0
         @body.rewind
         @buffer = nil
-        @requests_served += 1
-        @ready = true
+        set_ready
         return true
       end
 
@@ -439,6 +433,14 @@ module Puma
       false
     end
 
+    def set_ready
+      if @body_read_start
+        @env['puma.request_body_wait'] = Process.clock_gettime(Process::CLOCK_MONOTONIC, :millisecond) - @body_read_start
+      end
+      @requests_served += 1
+      @ready = true
+    end
+
     def write_400
       begin
         @io << ERROR_400_RESPONSE
diff --git a/test/test_puma_server.rb b/test/test_puma_server.rb
index 27772572..fcd70866 100644
--- a/test/test_puma_server.rb
+++ b/test/test_puma_server.rb
@@ -817,4 +817,44 @@ EOF
 
     assert_equal "HTTP/1.0 200 OK\r\nX-Empty-Header: \r\n\r\n", data
   end
+
+  def test_request_body_wait
+    request_body_wait = nil
+    @server.app = proc { |env|
+      request_body_wait = env['puma.request_body_wait']
+      [204, {}, []]
+    }
+
+    @server.add_tcp_listener @host, @port
+    @server.run
+
+    sock = TCPSocket.new @host, @server.connected_port
+    sock << "POST / HTTP/1.1\r\nHost: test.com\r\nContent-Type: text/plain\r\nContent-Length: 5\r\n\r\nh"
+    sleep 1
+    sock << "ello"
+
+    sock.gets
+
+    assert request_body_wait >= 1000
+  end
+
+  def test_request_body_wait_chunked
+    request_body_wait = nil
+    @server.app = proc { |env|
+      request_body_wait = env['puma.request_body_wait']
+      [204, {}, []]
+    }
+
+    @server.add_tcp_listener @host, @port
+    @server.run
+
+    sock = TCPSocket.new @host, @server.connected_port
+    sock << "GET / HTTP/1.1\r\nConnection: close\r\nTransfer-Encoding: chunked\r\n\r\n1\r\nh\r\n"
+    sleep 1
+    sock << "4\r\nello\r\n0\r\n"
+
+    sock.gets
+
+    assert request_body_wait >= 1000
+  end
 end