From 8ad02d4002159fa945b83cd8d40a68b04b7192cb Mon Sep 17 00:00:00 2001 From: Samuel Giddins Date: Wed, 20 Dec 2023 18:30:06 -0800 Subject: [PATCH] Avoid allocating intermediary strings when read/readpartial are passed an outbuf This accounts for a significant number of string allocations when reading rubygems, but we can avoid that in many places by only copying into the outbuf when present --- ext/zlib/zlib.c | 105 ++++++++++++++++++++++++++--------------- test/zlib/test_zlib.rb | 25 +++++++++- 2 files changed, 90 insertions(+), 40 deletions(-) diff --git a/ext/zlib/zlib.c b/ext/zlib/zlib.c index dc608ee..584cc20 100644 --- a/ext/zlib/zlib.c +++ b/ext/zlib/zlib.c @@ -90,7 +90,7 @@ static void zstream_expand_buffer_into(struct zstream*, unsigned long); static int zstream_expand_buffer_non_stream(struct zstream *z); static void zstream_append_buffer(struct zstream*, const Bytef*, long); static VALUE zstream_detach_buffer(struct zstream*); -static VALUE zstream_shift_buffer(struct zstream*, long); +static VALUE zstream_shift_buffer(struct zstream*, long, VALUE); static void zstream_buffer_ungets(struct zstream*, const Bytef*, unsigned long); static void zstream_buffer_ungetbyte(struct zstream*, int); static void zstream_append_input(struct zstream*, const Bytef*, long); @@ -170,8 +170,8 @@ static void gzfile_check_footer(struct gzfile*, VALUE outbuf); static void gzfile_write(struct gzfile*, Bytef*, long); static long gzfile_read_more(struct gzfile*, VALUE outbuf); static void gzfile_calc_crc(struct gzfile*, VALUE); -static VALUE gzfile_read(struct gzfile*, long); -static VALUE gzfile_read_all(struct gzfile*); +static VALUE gzfile_read(struct gzfile*, long, VALUE); +static VALUE gzfile_read_all(struct gzfile*, VALUE); static void gzfile_ungets(struct gzfile*, const Bytef*, long); static void gzfile_ungetbyte(struct gzfile*, int); static VALUE gzfile_writer_end_run(VALUE); @@ -820,19 +820,31 @@ zstream_detach_buffer(struct zstream *z) } static VALUE -zstream_shift_buffer(struct zstream *z, long len) +zstream_shift_buffer(struct zstream *z, long len, VALUE dst) { - VALUE dst; char *bufptr; long buflen = ZSTREAM_BUF_FILLED(z); if (buflen <= len) { - return zstream_detach_buffer(z); + if (NIL_P(dst) || (!ZSTREAM_IS_FINISHED(z) && !ZSTREAM_IS_GZFILE(z) && + rb_block_given_p())) { + return zstream_detach_buffer(z); + } else { + bufptr = RSTRING_PTR(z->buf); + rb_str_resize(dst, buflen); + memcpy(RSTRING_PTR(dst), bufptr, buflen); + } + buflen = 0; + } else { + bufptr = RSTRING_PTR(z->buf); + if (NIL_P(dst)) { + dst = rb_str_new(bufptr, len); + } else { + rb_str_resize(dst, len); + memcpy(RSTRING_PTR(dst), bufptr, len); + } + buflen -= len; } - - bufptr = RSTRING_PTR(z->buf); - dst = rb_str_new(bufptr, len); - buflen -= len; memmove(bufptr, bufptr + len, buflen); rb_str_set_len(z->buf, buflen); z->stream.next_out = (Bytef*)RSTRING_END(z->buf); @@ -2874,18 +2886,18 @@ gzfile_newstr(struct gzfile *gz, VALUE str) } static long -gzfile_fill(struct gzfile *gz, long len) +gzfile_fill(struct gzfile *gz, long len, VALUE outbuf) { if (len < 0) rb_raise(rb_eArgError, "negative length %ld given", len); if (len == 0) return 0; while (!ZSTREAM_IS_FINISHED(&gz->z) && ZSTREAM_BUF_FILLED(&gz->z) < len) { - gzfile_read_more(gz, Qnil); + gzfile_read_more(gz, outbuf); } if (GZFILE_IS_FINISHED(gz)) { if (!(gz->z.flags & GZFILE_FLAG_FOOTER_FINISHED)) { - gzfile_check_footer(gz, Qnil); + gzfile_check_footer(gz, outbuf); } return -1; } @@ -2893,14 +2905,27 @@ gzfile_fill(struct gzfile *gz, long len) } static VALUE -gzfile_read(struct gzfile *gz, long len) +gzfile_read(struct gzfile *gz, long len, VALUE outbuf) { VALUE dst; - len = gzfile_fill(gz, len); - if (len == 0) return rb_str_new(0, 0); - if (len < 0) return Qnil; - dst = zstream_shift_buffer(&gz->z, len); + len = gzfile_fill(gz, len, outbuf); + + if (len < 0) { + if (!NIL_P(outbuf)) + rb_str_resize(outbuf, 0); + return Qnil; + } + if (len == 0) { + if (NIL_P(outbuf)) + return rb_str_new(0, 0); + else { + rb_str_resize(outbuf, 0); + return outbuf; + } + } + + dst = zstream_shift_buffer(&gz->z, len, outbuf); if (!NIL_P(dst)) gzfile_calc_crc(gz, dst); return dst; } @@ -2933,7 +2958,7 @@ gzfile_readpartial(struct gzfile *gz, long len, VALUE outbuf) rb_raise(rb_eEOFError, "end of file reached"); } - dst = zstream_shift_buffer(&gz->z, len); + dst = zstream_shift_buffer(&gz->z, len, outbuf); gzfile_calc_crc(gz, dst); if (!NIL_P(outbuf)) { @@ -2945,17 +2970,19 @@ gzfile_readpartial(struct gzfile *gz, long len, VALUE outbuf) } static VALUE -gzfile_read_all(struct gzfile *gz) +gzfile_read_all(struct gzfile *gz, VALUE dst) { - VALUE dst; - while (!ZSTREAM_IS_FINISHED(&gz->z)) { - gzfile_read_more(gz, Qnil); + gzfile_read_more(gz, dst); } if (GZFILE_IS_FINISHED(gz)) { if (!(gz->z.flags & GZFILE_FLAG_FOOTER_FINISHED)) { - gzfile_check_footer(gz, Qnil); + gzfile_check_footer(gz, dst); } + if (!NIL_P(dst)) { + rb_str_resize(dst, 0); + return dst; + } return rb_str_new(0, 0); } @@ -2993,7 +3020,7 @@ gzfile_getc(struct gzfile *gz) de = (unsigned char *)ds + GZFILE_CBUF_CAPA; (void)rb_econv_convert(gz->ec, &sp, se, &dp, de, ECONV_PARTIAL_INPUT|ECONV_AFTER_OUTPUT); rb_econv_check_error(gz->ec); - dst = zstream_shift_buffer(&gz->z, sp - ss); + dst = zstream_shift_buffer(&gz->z, sp - ss, Qnil); gzfile_calc_crc(gz, dst); rb_str_resize(cbuf, dp - ds); return cbuf; @@ -3001,7 +3028,7 @@ gzfile_getc(struct gzfile *gz) else { buf = gz->z.buf; len = rb_enc_mbclen(RSTRING_PTR(buf), RSTRING_END(buf), gz->enc); - dst = gzfile_read(gz, len); + dst = gzfile_read(gz, len, Qnil); if (NIL_P(dst)) return dst; return gzfile_newstr(gz, dst); } @@ -3906,7 +3933,7 @@ rb_gzreader_s_zcat(int argc, VALUE *argv, VALUE klass) if (!buf) { buf = rb_str_new(0, 0); } - tmpbuf = gzfile_read_all(get_gzfile(obj)); + tmpbuf = gzfile_read_all(get_gzfile(obj), Qnil); rb_str_cat(buf, RSTRING_PTR(tmpbuf), RSTRING_LEN(tmpbuf)); } @@ -4008,19 +4035,19 @@ static VALUE rb_gzreader_read(int argc, VALUE *argv, VALUE obj) { struct gzfile *gz = get_gzfile(obj); - VALUE vlen; + VALUE vlen, outbuf; long len; - rb_scan_args(argc, argv, "01", &vlen); + rb_scan_args(argc, argv, "02", &vlen, &outbuf); if (NIL_P(vlen)) { - return gzfile_read_all(gz); + return gzfile_read_all(gz, outbuf); } len = NUM2INT(vlen); if (len < 0) { rb_raise(rb_eArgError, "negative length %ld given", len); } - return gzfile_read(gz, len); + return gzfile_read(gz, len, outbuf); } /* @@ -4093,7 +4120,7 @@ rb_gzreader_getbyte(VALUE obj) struct gzfile *gz = get_gzfile(obj); VALUE dst; - dst = gzfile_read(gz, 1); + dst = gzfile_read(gz, 1, Qnil); if (!NIL_P(dst)) { dst = INT2FIX((unsigned int)(RSTRING_PTR(dst)[0]) & 0xff); } @@ -4214,7 +4241,7 @@ gzreader_skip_linebreaks(struct gzfile *gz) } } - str = zstream_shift_buffer(&gz->z, n - 1); + str = zstream_shift_buffer(&gz->z, n - 1, Qnil); gzfile_calc_crc(gz, str); } @@ -4235,7 +4262,7 @@ gzreader_charboundary(struct gzfile *gz, long n) if (l < n) { int n_bytes = rb_enc_precise_mbclen(p, e, gz->enc); if (MBCLEN_NEEDMORE_P(n_bytes)) { - if ((l = gzfile_fill(gz, n + MBCLEN_NEEDMORE_LEN(n_bytes))) > 0) { + if ((l = gzfile_fill(gz, n + MBCLEN_NEEDMORE_LEN(n_bytes), Qnil)) > 0) { return l; } } @@ -4287,10 +4314,10 @@ gzreader_gets(int argc, VALUE *argv, VALUE obj) if (NIL_P(rs)) { if (limit < 0) { - dst = gzfile_read_all(gz); + dst = gzfile_read_all(gz, Qnil); if (RSTRING_LEN(dst) == 0) return Qnil; } - else if ((n = gzfile_fill(gz, limit)) <= 0) { + else if ((n = gzfile_fill(gz, limit, Qnil)) <= 0) { return Qnil; } else { @@ -4300,7 +4327,7 @@ gzreader_gets(int argc, VALUE *argv, VALUE obj) else { n = limit; } - dst = zstream_shift_buffer(&gz->z, n); + dst = zstream_shift_buffer(&gz->z, n, Qnil); if (NIL_P(dst)) return dst; gzfile_calc_crc(gz, dst); dst = gzfile_newstr(gz, dst); @@ -4327,7 +4354,7 @@ gzreader_gets(int argc, VALUE *argv, VALUE obj) while (ZSTREAM_BUF_FILLED(&gz->z) < rslen) { if (ZSTREAM_IS_FINISHED(&gz->z)) { if (ZSTREAM_BUF_FILLED(&gz->z) > 0) gz->lineno++; - return gzfile_read(gz, rslen); + return gzfile_read(gz, rslen, Qnil); } gzfile_read_more(gz, Qnil); } @@ -4364,7 +4391,7 @@ gzreader_gets(int argc, VALUE *argv, VALUE obj) } gz->lineno++; - dst = gzfile_read(gz, n); + dst = gzfile_read(gz, n, Qnil); if (NIL_P(dst)) return dst; if (rspara) { gzreader_skip_linebreaks(gz); diff --git a/test/zlib/test_zlib.rb b/test/zlib/test_zlib.rb index 779c583..a9f71f5 100644 --- a/test/zlib/test_zlib.rb +++ b/test/zlib/test_zlib.rb @@ -988,6 +988,25 @@ def test_read assert_raise(ArgumentError) { f.read(-1) } assert_equal(str, f.read) end + + Zlib::GzipReader.open(t.path) do |f| + s = "".b + + assert_raise(ArgumentError) { f.read(-1, s) } + + assert_same s, f.read(1, s) + assert_equal "\xE3".b, s + + assert_same s, f.read(2, s) + assert_equal "\x81\x82".b, s + + assert_same s, f.read(6, s) + assert_equal "\u3044\u3046".b, s + + assert_nil f.read(1, s) + assert_equal "".b, s + assert_predicate f, :eof? + end } end @@ -1002,10 +1021,14 @@ def test_readpartial Zlib::GzipReader.open(t.path) do |f| s = "".dup - f.readpartial(3, s) + assert_same s, f.readpartial(3, s) assert("foo".start_with?(s)) assert_raise(ArgumentError) { f.readpartial(-1) } + + assert_same s, f.readpartial(3, s) + + assert_predicate f, :eof? end } end