diff --git a/ext/http11/http11.c b/ext/http11/http11.c index 0d2c79ab..88c0311b 100644 --- a/ext/http11/http11.c +++ b/ext/http11/http11.c @@ -9,26 +9,11 @@ #include "http11_parser.h" #include #include "tst.h" -#include - -typedef struct BMHSearchState { - size_t occ[UCHAR_MAX+1]; - size_t htotal; - unsigned char *needle; - size_t nlen; - size_t skip; - size_t max_find; - - size_t *found_at; - size_t nfound; -} BMHSearchState; static VALUE mMongrel; static VALUE cHttpParser; -static VALUE cBMHSearch; static VALUE cURIClassifier; static VALUE eHttpParserError; -static VALUE eBMHSearchError; #define id_handler_map rb_intern("@handler_map") #define id_http_body rb_intern("@http_body") @@ -192,294 +177,6 @@ void header_done(void *data, const char *at, size_t length) } -inline size_t max(size_t a, size_t b) { return a>b ? a : b; } - -void BMHSearch_free(void *data) { - BMHSearchState *S = (BMHSearchState *)data; - - if(data) { - if(S->needle) free(S->needle); - if(S->found_at) free(S->found_at); - free(data); - } -} - - -VALUE BMHSearch_alloc(VALUE klass) -{ - VALUE obj; - - BMHSearchState *S = ALLOC_N(BMHSearchState, 1); - - obj = Data_Wrap_Struct(klass, NULL, BMHSearch_free, S); - - return obj; -} - -/** - * call-seq: - * BMHSearch.new(needle, max_find) -> searcher - * - * Prepares the needle for searching and allocates enough space to - * retain max_find locations. BMHSearch will throw an exception - * if you go over the max_find without calling BMHSearch#pop. - */ -VALUE BMHSearch_init(VALUE self, VALUE needle, VALUE max_find) -{ - BMHSearchState *S = NULL; - size_t a, b; - - DATA_GET(self, BMHSearchState, S); - - S->htotal = 0; - S->nfound = 0; - S->skip = 0; - S->nlen = RSTRING(needle)->len; - S->max_find = FIX2INT(max_find); - S->needle = NULL; - S->found_at = NULL; - - if(S->nlen <= 0) rb_raise(eBMHSearchError, "Needle can't be 0 length."); - - // copy it so we don't have to worry about Ruby messing it up - S->needle = ALLOC_N(unsigned char, S->nlen); - memcpy(S->needle, RSTRING(needle)->ptr, S->nlen); - - // setup the number of finds they want - S->found_at = ALLOC_N(size_t, S->max_find); - - /* Preprocess */ - /* Initialize the table to default value */ - for(a=0; aocc[a] = S->nlen; - - /* Then populate it with the analysis of the needle */ - b=S->nlen; - for(a=0; a < S->nlen; a=a+1) - { - b=b-1; - S->occ[S->needle[a]] = b; - } - - return self; -} - -/** - * call-seq: - * searcher.find(haystack) -> nfound - * - * This will look for the needle in the haystack and return the total - * number found so far. The key ingredient is that you can pass in - * as many haystacks as you want as long as they are longer than the - * needle. Each time you call find, it adds to the total and number - * found. The purpose is to allow you to read a large string in chunks - * and find the needle. - * - * The main purpose of the nfound is so that you can periodically call - * BMHSearch.pop to clear the list of found locations before you run - * out of max_finds. - */ -VALUE BMHSearch_find(VALUE self, VALUE hay) -{ - size_t hpos = 0, npos = 0, skip = 0; - unsigned char* haystack = NULL; - size_t hlen = 0, last = 0; - BMHSearchState *S = NULL; - - DATA_GET(self, BMHSearchState, S); - - haystack = RSTRING(hay)->ptr; - hlen = RSTRING(hay)->len; - - /* Sanity checks on the parameters */ - if(S->nlen > hlen) { - rb_raise(eBMHSearchError, "Haystack can't be smaller than needle."); - } else if(S->nlen <= 0) { - rb_raise(eBMHSearchError, "Needle can't be 0 length."); - } else if(!haystack || !S->needle) { - rb_raise(eBMHSearchError, "Corrupt search state. REALLY BAD!"); - } - - /* Check for a trailing remainder, which is only possible if skip > 1 */ - if(S->skip) { - // only scan for what should be the rest of the string - size_t remainder = S->nlen - S->skip; - if(!memcmp(haystack, S->needle + S->skip, remainder)) { - // record a find - S->found_at[S->nfound++] = S->htotal - S->skip; - // move up by the amount found - hpos += remainder; - } - } - - - /* Start searching from the end of S->needle (this is not a typo) */ - hpos = S->nlen-1; - - while(hpos < hlen) { - /* Compare the S->needle backwards, and stop when first mismatch is found */ - npos = S->nlen-1; - - while(hpos < hlen && haystack[hpos] == S->needle[npos]) { - if(npos == 0) { - // found one, log it in found_at, but reduce by the skip if we're at the beginning - S->found_at[S->nfound++] = hpos + S->htotal; - last = hpos; // keep track of the last one we found in this chunk - - if(S->nfound > S->max_find) { - rb_raise(eBMHSearchError, "More than max requested needles found. Use pop."); - } - - /* continue at the next possible spot, from end of S->needle (not typo) */ - hpos += S->nlen + S->nlen - 1; - npos = S->nlen - 1; - continue; - } else { - hpos--; - npos--; - } - } - - // exhausted the string already, done - if(hpos > hlen) break; - - /* Find out how much ahead we can skip based on the byte that was found */ - skip = max(S->nlen - npos, S->occ[haystack[hpos]]); - hpos += skip; - } - - - skip = 0; // skip defaults to 0 - - // don't bother if the string ends in the needle completely - if(S->nfound == 0 || last != hlen - S->nlen) { - // invert the occ array to figure out how much to jump back - size_t back = S->nlen - S->occ[haystack[hlen-1]]; - - // the needle could have an ending char that's also repeated in the needle - // in that case, we have to adjust to search for nlen-1 just in case - if(haystack[hlen-1] == S->needle[S->nlen - 1]) { - back = S->nlen - 1; - } - - int found_it = 0; // defaults to 0 for test at end of it - if(back < S->nlen && back > 0) { - // search for the longest possible prefix - for(hpos = hlen - back; hpos < hlen; hpos++) { - skip = hlen - hpos; - if(!memcmp(haystack+hpos, S->needle, skip)) { - found_it = 1; // make sure we actually found it - break; - } - } - } - // skip will be 1 if nothing was found, so we have to force it 0 - if(!found_it) skip = 0; - } - - // keep track of the skip, it's set to 0 by the code above - S->skip = skip; - // update the total processed so far so indexes will be in sync - S->htotal += hlen; - - return INT2FIX(S->nfound); -} - - -/** - * call-seq: - * searcher.pop -> [1, 2, 3, ..] - * - * Pops off the locations in the TOTAL string (over all haystacks processed) - * clearing the internal buffer for more space and letting you use it. - */ -VALUE BMHSearch_pop(VALUE self) -{ - int i = 0; - BMHSearchState *S = NULL; - VALUE result; - - DATA_GET(self, BMHSearchState, S); - - result = rb_ary_new(); - for(i = 0; i < S->nfound; i++) { - rb_ary_push(result, INT2FIX(S->found_at[i])); - } - - S->nfound = 0; - - return result; -} - - -/** - * call-seq: - * searcher.nfound -> Fixed - * - * The number found so far. - */ -VALUE BMHSearch_nfound(VALUE self) -{ - BMHSearchState *S = NULL; - DATA_GET(self, BMHSearchState, S); - return INT2FIX(S->nfound); -} - -/** - * call-seq: - * searcher.max_find -> Fixed - * - * The current maximum find setting (cannot be changed). - */ -VALUE BMHSearch_max_find(VALUE self) -{ - BMHSearchState *S = NULL; - DATA_GET(self, BMHSearchState, S); - return INT2FIX(S->max_find); -} - -/** - * call-seq: - * searcher.total -> Fixed - * - * The total bytes processed so far. - */ -VALUE BMHSearch_total(VALUE self) -{ - BMHSearchState *S = NULL; - DATA_GET(self, BMHSearchState, S); - return INT2FIX(S->htotal); -} - -/** - * call-seq: - * searcher.needle -> String - * - * A COPY of the internal needle string being used for searching. - */ -VALUE BMHSearch_needle(VALUE self) -{ - BMHSearchState *S = NULL; - DATA_GET(self, BMHSearchState, S); - return rb_str_new(S->needle, S->nlen); -} - -/** - * call-seq: - * searcher.has_trailing? -> Boolean - * - * Tells you whether the last call to BMHSearch#find has possible - * trailing matches. This is mostly for completeness but you - * might use this to adjust the amount of data you're reading - * or the buffer size. - */ -VALUE BMHSearch_has_trailing(VALUE self) -{ - BMHSearchState *S = NULL; - DATA_GET(self, BMHSearchState, S); - return S->skip ? Qtrue : Qfalse; -} - - void HttpParser_free(void *data) { TRACE(); @@ -864,18 +561,6 @@ void Init_http11() DEF_GLOBAL(port_80, "80"); eHttpParserError = rb_define_class_under(mMongrel, "HttpParserError", rb_eIOError); - eBMHSearchError = rb_define_class_under(mMongrel, "BMHSearchError", rb_eIOError); - - cBMHSearch = rb_define_class_under(mMongrel, "BMHSearch", rb_cObject); - rb_define_alloc_func(cBMHSearch, BMHSearch_alloc); - rb_define_method(cBMHSearch, "initialize", BMHSearch_init,2); - rb_define_method(cBMHSearch, "find", BMHSearch_find,1); - rb_define_method(cBMHSearch, "pop", BMHSearch_pop,0); - rb_define_method(cBMHSearch, "nfound", BMHSearch_nfound,0); - rb_define_method(cBMHSearch, "total", BMHSearch_total,0); - rb_define_method(cBMHSearch, "needle", BMHSearch_needle,0); - rb_define_method(cBMHSearch, "max_find", BMHSearch_max_find,0); - rb_define_method(cBMHSearch, "has_trailing?", BMHSearch_has_trailing,0); cHttpParser = rb_define_class_under(mMongrel, "HttpParser", rb_cObject); rb_define_alloc_func(cHttpParser, HttpParser_alloc); diff --git a/lib/mongrel.rb b/lib/mongrel.rb index f9e373b4..aad3a31f 100644 --- a/lib/mongrel.rb +++ b/lib/mongrel.rb @@ -199,20 +199,7 @@ module Mongrel content_length = @params[Const::CONTENT_LENGTH].to_i remain = content_length - @params.http_body.length - if @params.has_key? 'CONTENT_TYPE' - # see if it's multipart, and carve out the boundary for later - @mpart_type, @mpart_boundary = @params['CONTENT_TYPE'].split(/;\s*/) - if @mpart_type and @mpart_boundary and @mpart_boundary.include? "=" - @mpart_boundary = @mpart_boundary.split("=")[1].strip - STDERR.puts "boundary: #{@mpart_boundary}" - @params['MULTIPART_TYPE'] = @mpart_type - @params['MULTIPART_BOUNDARY'] = @mpart_boundary - @search = BMHSearch.new(@mpart_boundary, 100) - end - end - dispatcher.request_begins(@params) if dispatcher - @search.find @params.http_body if @search and @params.http_body.length > @mpart_boundary.length # Some clients (like FF1.0) report 0 for body and then send a body. This will probably truncate them but at least the request goes through usually. if remain <= 0 @@ -235,15 +222,6 @@ module Mongrel read_body(remain, content_length, dispatcher) end - if @search and @body - STDERR.puts "number of boundaries: #{@search.nfound}" - @body.rewind - @search.pop.each do |boundary| - @body.seek(boundary - 2) - STDERR.puts "BOUNDARY at #{boundary}: #{@body.read(@mpart_boundary.length + 2)}" - end - end - @body.rewind if @body end @@ -256,7 +234,6 @@ module Mongrel begin # write the odd sized chunk first @params.http_body = read_socket(remain % Const::CHUNK_SIZE) - @search.find(@params.http_body) if @search remain -= @body.write(@params.http_body) dispatcher.request_progress(@params, remain, total) if dispatcher @@ -265,7 +242,6 @@ module Mongrel until remain <= 0 or @socket.closed? # ASSUME: we are writing to a disk and these writes always write the requested amount @params.http_body = read_socket(Const::CHUNK_SIZE) - @search.find(@params.http_body) if @search remain -= @body.write(@params.http_body) dispatcher.request_progress(@params, remain, total) if dispatcher end diff --git a/test/test_bmhsearch.rb b/test/test_bmhsearch.rb deleted file mode 100644 index 291d35ea..00000000 --- a/test/test_bmhsearch.rb +++ /dev/null @@ -1,150 +0,0 @@ -# Copyright (c) 2005 Zed A. Shaw -# You can redistribute it and/or modify it under the same terms as Ruby. -# -# Additional work donated by contributors. See http://mongrel.rubyforge.org/attributions.html -# for more information. - -require 'test/unit' -require 'mongrel' -require 'http11' -require File.dirname(__FILE__) + "/testhelp.rb" - -include Mongrel - -class BMHSearchTest < Test::Unit::TestCase - - def setup - @needle = "needle" - end - - def teardown - end - - def test_operations - to_find = "this has a needle and a need" - s = BMHSearch.new(@needle, 10) - assert s.needle == @needle, "internal needle does not match needle" - assert_equal s.max_find, 10, "max_found isn't right" - - n = s.find(to_find) - assert_equal n,1,"wrong number found" - assert_equal n,s.nfound, "returned found and nfound don't match" - - f = s.pop - assert f, "didn't get the locations" - assert_equal f.length,n,"wrong number of returned finds" - assert_equal f[0],11,"wrong location returned" - - assert_equal to_find.length, s.total, "total doesn't match searched length" - assert s.has_trailing?, "should have trailing data" - - to_find2 = "le through a needle dude" - n = s.find(to_find2) - - assert_equal n,2,"second find not working" - assert_equal n,s.nfound, "nfound don't match" - - f = s.pop - assert f, "pop after second find not working" - assert_equal f[0],24,"first location of second find not right" - assert_equal f[1],41,"second location of second find not right" - - assert_equal to_find.length+to_find2.length,s.total,"wrong total length" - assert !s.has_trailing?, "should not have trailing" - - assert_raise BMHSearchError do - 11.times { s.find("this has a needle") } - end - end - - def test_boundaries_over_chunks - hay = ["zed is a tool ba", - "ggiethis has the baggieb", - "aggie of baggie-baggie tricks in another ", - "baggie"] - - total = "zed is a tool baggiethis has the baggiebaggie of baggie-baggie tricks in another baggie" - - s = BMHSearch.new("baggie", 20) - - hay.each {|h| s.find(h) } - hay_found = s.pop - - s = BMHSearch.new("baggie", 20) - s.find(total) - total_found = s.pop - - assert_equal hay_found.length, total_found.length, "wrong number of needles found" - - total_found.length.times do |i| - assert_equal total_found[i], hay_found[i], "boundary doesn't match" - end - end - - - def test_fuzzing - begin - has_rfuzz = require 'rfuzz/random' - rescue Object - has_rfuzz = false - end - - if has_rfuzz - r = RFuzz::RandomGenerator.new - needles = r.base64(20, 64).collect {|n| "\r\n" + n.strip } - needles.each do |needle| - next if needle.length == 0 - - nchunks = r.num(10) + 10 - bmh = BMHSearch.new(needle, nchunks+1) - total = "" # used to collect the full string for compare - - # each needle is sprinkled into up to 100 chunks - nchunks.times do - # each chunk is up to 16k in size - chunk = r.bytes(r.num(16 * 1024) + needle.length * 2) - chunk.gsub! needle, "" - - # make about 60% go across boundaries - if r.num(10) < 6 - # this one gets cut in two - cut_at = r.num(needle.length - 1) + 1 - n1 = needle[0 ... cut_at] - n2 = needle[cut_at .. -1] - - assert_equal n1+n2, needle, "oops, messed up breaking the needle" - - last_nfound = bmh.nfound - bmh.find(chunk + n1) - assert bmh.has_trailing?, "should have trailing on #{n1}:#{n2} on chunk length: #{chunk.length}" - assert_equal last_nfound, bmh.nfound, "shouldn't find it yet" - - bmh.find(n2 + chunk) - assert_equal last_nfound+1, bmh.nfound, "should have found the boundary for #{n1}:#{n2} on chunk length: #{chunk.length}" - total << chunk + n1 + n2 + chunk - else - # this one is put in complete - bmh.find(chunk + needle) - bmh.find(chunk) - - total << chunk + needle + chunk - end - end - - tbmh = BMHSearch.new(needle, nchunks+1) - tbmh.find(total) - - assert_equal total.length, bmh.total, "totals don't match" - assert_equal tbmh.nfound, bmh.nfound, "nfound don't match" - - total_found = tbmh.pop - hay_found = bmh.pop - - total_found.length.times do |i| - assert_equal total_found[i], hay_found[i], "boundary doesn't match" - end - end - end - end -end -