Skip to content

Commit

Permalink
Avoid allocating intermediary strings when read/readpartial are passe…
Browse files Browse the repository at this point in the history
…d an outbuf

This accounts for a significant number of string allocations when reading rubygems, but we can avoid that in many places by only copying into the outbuf when present
  • Loading branch information
segiddins committed Dec 21, 2023
1 parent 2561e12 commit 8ad02d4
Show file tree
Hide file tree
Showing 2 changed files with 90 additions and 40 deletions.
105 changes: 66 additions & 39 deletions ext/zlib/zlib.c
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ static void zstream_expand_buffer_into(struct zstream*, unsigned long);
static int zstream_expand_buffer_non_stream(struct zstream *z);
static void zstream_append_buffer(struct zstream*, const Bytef*, long);
static VALUE zstream_detach_buffer(struct zstream*);
static VALUE zstream_shift_buffer(struct zstream*, long);
static VALUE zstream_shift_buffer(struct zstream*, long, VALUE);
static void zstream_buffer_ungets(struct zstream*, const Bytef*, unsigned long);
static void zstream_buffer_ungetbyte(struct zstream*, int);
static void zstream_append_input(struct zstream*, const Bytef*, long);
Expand Down Expand Up @@ -170,8 +170,8 @@ static void gzfile_check_footer(struct gzfile*, VALUE outbuf);
static void gzfile_write(struct gzfile*, Bytef*, long);
static long gzfile_read_more(struct gzfile*, VALUE outbuf);
static void gzfile_calc_crc(struct gzfile*, VALUE);
static VALUE gzfile_read(struct gzfile*, long);
static VALUE gzfile_read_all(struct gzfile*);
static VALUE gzfile_read(struct gzfile*, long, VALUE);
static VALUE gzfile_read_all(struct gzfile*, VALUE);
static void gzfile_ungets(struct gzfile*, const Bytef*, long);
static void gzfile_ungetbyte(struct gzfile*, int);
static VALUE gzfile_writer_end_run(VALUE);
Expand Down Expand Up @@ -820,19 +820,31 @@ zstream_detach_buffer(struct zstream *z)
}

static VALUE
zstream_shift_buffer(struct zstream *z, long len)
zstream_shift_buffer(struct zstream *z, long len, VALUE dst)
{
VALUE dst;
char *bufptr;
long buflen = ZSTREAM_BUF_FILLED(z);

if (buflen <= len) {
return zstream_detach_buffer(z);
if (NIL_P(dst) || (!ZSTREAM_IS_FINISHED(z) && !ZSTREAM_IS_GZFILE(z) &&
rb_block_given_p())) {
return zstream_detach_buffer(z);
} else {
bufptr = RSTRING_PTR(z->buf);
rb_str_resize(dst, buflen);
memcpy(RSTRING_PTR(dst), bufptr, buflen);
}
buflen = 0;
} else {
bufptr = RSTRING_PTR(z->buf);
if (NIL_P(dst)) {
dst = rb_str_new(bufptr, len);
} else {
rb_str_resize(dst, len);
memcpy(RSTRING_PTR(dst), bufptr, len);
}
buflen -= len;
}

bufptr = RSTRING_PTR(z->buf);
dst = rb_str_new(bufptr, len);
buflen -= len;
memmove(bufptr, bufptr + len, buflen);
rb_str_set_len(z->buf, buflen);
z->stream.next_out = (Bytef*)RSTRING_END(z->buf);
Expand Down Expand Up @@ -2874,33 +2886,46 @@ gzfile_newstr(struct gzfile *gz, VALUE str)
}

static long
gzfile_fill(struct gzfile *gz, long len)
gzfile_fill(struct gzfile *gz, long len, VALUE outbuf)
{
if (len < 0)
rb_raise(rb_eArgError, "negative length %ld given", len);
if (len == 0)
return 0;
while (!ZSTREAM_IS_FINISHED(&gz->z) && ZSTREAM_BUF_FILLED(&gz->z) < len) {
gzfile_read_more(gz, Qnil);
gzfile_read_more(gz, outbuf);
}
if (GZFILE_IS_FINISHED(gz)) {
if (!(gz->z.flags & GZFILE_FLAG_FOOTER_FINISHED)) {
gzfile_check_footer(gz, Qnil);
gzfile_check_footer(gz, outbuf);
}
return -1;
}
return len < ZSTREAM_BUF_FILLED(&gz->z) ? len : ZSTREAM_BUF_FILLED(&gz->z);
}

static VALUE
gzfile_read(struct gzfile *gz, long len)
gzfile_read(struct gzfile *gz, long len, VALUE outbuf)
{
VALUE dst;

len = gzfile_fill(gz, len);
if (len == 0) return rb_str_new(0, 0);
if (len < 0) return Qnil;
dst = zstream_shift_buffer(&gz->z, len);
len = gzfile_fill(gz, len, outbuf);

if (len < 0) {
if (!NIL_P(outbuf))
rb_str_resize(outbuf, 0);
return Qnil;
}
if (len == 0) {
if (NIL_P(outbuf))
return rb_str_new(0, 0);
else {
rb_str_resize(outbuf, 0);
return outbuf;
}
}

dst = zstream_shift_buffer(&gz->z, len, outbuf);
if (!NIL_P(dst)) gzfile_calc_crc(gz, dst);
return dst;
}
Expand Down Expand Up @@ -2933,7 +2958,7 @@ gzfile_readpartial(struct gzfile *gz, long len, VALUE outbuf)
rb_raise(rb_eEOFError, "end of file reached");
}

dst = zstream_shift_buffer(&gz->z, len);
dst = zstream_shift_buffer(&gz->z, len, outbuf);
gzfile_calc_crc(gz, dst);

if (!NIL_P(outbuf)) {
Expand All @@ -2945,17 +2970,19 @@ gzfile_readpartial(struct gzfile *gz, long len, VALUE outbuf)
}

static VALUE
gzfile_read_all(struct gzfile *gz)
gzfile_read_all(struct gzfile *gz, VALUE dst)
{
VALUE dst;

while (!ZSTREAM_IS_FINISHED(&gz->z)) {
gzfile_read_more(gz, Qnil);
gzfile_read_more(gz, dst);
}
if (GZFILE_IS_FINISHED(gz)) {
if (!(gz->z.flags & GZFILE_FLAG_FOOTER_FINISHED)) {
gzfile_check_footer(gz, Qnil);
gzfile_check_footer(gz, dst);
}
if (!NIL_P(dst)) {
rb_str_resize(dst, 0);
return dst;
}
return rb_str_new(0, 0);
}

Expand Down Expand Up @@ -2993,15 +3020,15 @@ gzfile_getc(struct gzfile *gz)
de = (unsigned char *)ds + GZFILE_CBUF_CAPA;
(void)rb_econv_convert(gz->ec, &sp, se, &dp, de, ECONV_PARTIAL_INPUT|ECONV_AFTER_OUTPUT);
rb_econv_check_error(gz->ec);
dst = zstream_shift_buffer(&gz->z, sp - ss);
dst = zstream_shift_buffer(&gz->z, sp - ss, Qnil);
gzfile_calc_crc(gz, dst);
rb_str_resize(cbuf, dp - ds);
return cbuf;
}
else {
buf = gz->z.buf;
len = rb_enc_mbclen(RSTRING_PTR(buf), RSTRING_END(buf), gz->enc);
dst = gzfile_read(gz, len);
dst = gzfile_read(gz, len, Qnil);
if (NIL_P(dst)) return dst;
return gzfile_newstr(gz, dst);
}
Expand Down Expand Up @@ -3906,7 +3933,7 @@ rb_gzreader_s_zcat(int argc, VALUE *argv, VALUE klass)
if (!buf) {
buf = rb_str_new(0, 0);
}
tmpbuf = gzfile_read_all(get_gzfile(obj));
tmpbuf = gzfile_read_all(get_gzfile(obj), Qnil);
rb_str_cat(buf, RSTRING_PTR(tmpbuf), RSTRING_LEN(tmpbuf));
}

Expand Down Expand Up @@ -4008,19 +4035,19 @@ static VALUE
rb_gzreader_read(int argc, VALUE *argv, VALUE obj)
{
struct gzfile *gz = get_gzfile(obj);
VALUE vlen;
VALUE vlen, outbuf;
long len;

rb_scan_args(argc, argv, "01", &vlen);
rb_scan_args(argc, argv, "02", &vlen, &outbuf);
if (NIL_P(vlen)) {
return gzfile_read_all(gz);
return gzfile_read_all(gz, outbuf);
}

len = NUM2INT(vlen);
if (len < 0) {
rb_raise(rb_eArgError, "negative length %ld given", len);
}
return gzfile_read(gz, len);
return gzfile_read(gz, len, outbuf);
}

/*
Expand Down Expand Up @@ -4093,7 +4120,7 @@ rb_gzreader_getbyte(VALUE obj)
struct gzfile *gz = get_gzfile(obj);
VALUE dst;

dst = gzfile_read(gz, 1);
dst = gzfile_read(gz, 1, Qnil);
if (!NIL_P(dst)) {
dst = INT2FIX((unsigned int)(RSTRING_PTR(dst)[0]) & 0xff);
}
Expand Down Expand Up @@ -4214,7 +4241,7 @@ gzreader_skip_linebreaks(struct gzfile *gz)
}
}

str = zstream_shift_buffer(&gz->z, n - 1);
str = zstream_shift_buffer(&gz->z, n - 1, Qnil);
gzfile_calc_crc(gz, str);
}

Expand All @@ -4235,7 +4262,7 @@ gzreader_charboundary(struct gzfile *gz, long n)
if (l < n) {
int n_bytes = rb_enc_precise_mbclen(p, e, gz->enc);
if (MBCLEN_NEEDMORE_P(n_bytes)) {
if ((l = gzfile_fill(gz, n + MBCLEN_NEEDMORE_LEN(n_bytes))) > 0) {
if ((l = gzfile_fill(gz, n + MBCLEN_NEEDMORE_LEN(n_bytes), Qnil)) > 0) {
return l;
}
}
Expand Down Expand Up @@ -4287,10 +4314,10 @@ gzreader_gets(int argc, VALUE *argv, VALUE obj)

if (NIL_P(rs)) {
if (limit < 0) {
dst = gzfile_read_all(gz);
dst = gzfile_read_all(gz, Qnil);
if (RSTRING_LEN(dst) == 0) return Qnil;
}
else if ((n = gzfile_fill(gz, limit)) <= 0) {
else if ((n = gzfile_fill(gz, limit, Qnil)) <= 0) {
return Qnil;
}
else {
Expand All @@ -4300,7 +4327,7 @@ gzreader_gets(int argc, VALUE *argv, VALUE obj)
else {
n = limit;
}
dst = zstream_shift_buffer(&gz->z, n);
dst = zstream_shift_buffer(&gz->z, n, Qnil);
if (NIL_P(dst)) return dst;
gzfile_calc_crc(gz, dst);
dst = gzfile_newstr(gz, dst);
Expand All @@ -4327,7 +4354,7 @@ gzreader_gets(int argc, VALUE *argv, VALUE obj)
while (ZSTREAM_BUF_FILLED(&gz->z) < rslen) {
if (ZSTREAM_IS_FINISHED(&gz->z)) {
if (ZSTREAM_BUF_FILLED(&gz->z) > 0) gz->lineno++;
return gzfile_read(gz, rslen);
return gzfile_read(gz, rslen, Qnil);
}
gzfile_read_more(gz, Qnil);
}
Expand Down Expand Up @@ -4364,7 +4391,7 @@ gzreader_gets(int argc, VALUE *argv, VALUE obj)
}

gz->lineno++;
dst = gzfile_read(gz, n);
dst = gzfile_read(gz, n, Qnil);
if (NIL_P(dst)) return dst;
if (rspara) {
gzreader_skip_linebreaks(gz);
Expand Down
25 changes: 24 additions & 1 deletion test/zlib/test_zlib.rb
Original file line number Diff line number Diff line change
Expand Up @@ -988,6 +988,25 @@ def test_read
assert_raise(ArgumentError) { f.read(-1) }
assert_equal(str, f.read)
end

Zlib::GzipReader.open(t.path) do |f|
s = "".b

assert_raise(ArgumentError) { f.read(-1, s) }

assert_same s, f.read(1, s)
assert_equal "\xE3".b, s

assert_same s, f.read(2, s)
assert_equal "\x81\x82".b, s

assert_same s, f.read(6, s)
assert_equal "\u3044\u3046".b, s

assert_nil f.read(1, s)
assert_equal "".b, s
assert_predicate f, :eof?
end
}
end

Expand All @@ -1002,10 +1021,14 @@ def test_readpartial

Zlib::GzipReader.open(t.path) do |f|
s = "".dup
f.readpartial(3, s)
assert_same s, f.readpartial(3, s)
assert("foo".start_with?(s))

assert_raise(ArgumentError) { f.readpartial(-1) }

assert_same s, f.readpartial(3, s)

assert_predicate f, :eof?
end
}
end
Expand Down

0 comments on commit 8ad02d4

Please sign in to comment.