Skip to content

Commit

Permalink
Avoid using map based buffer
Browse files Browse the repository at this point in the history
  • Loading branch information
syucream committed Jul 26, 2020
1 parent f524f61 commit 68671df
Show file tree
Hide file tree
Showing 10 changed files with 208 additions and 49 deletions.
3 changes: 2 additions & 1 deletion arrow/json/writer.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,10 @@ import (
"encoding/json"
"errors"
"fmt"
"io"

"github.com/apache/arrow/go/arrow"
"github.com/apache/arrow/go/arrow/array"
"io"
)

var (
Expand Down
18 changes: 9 additions & 9 deletions arrow/json/writer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -211,15 +211,15 @@ func TestToGo(t *testing.T) {
// uint8 TODO support this case
// []uint8 will be converted base64-ed string
/*
{
data: func() *array.Data {
b := array.NewUint8Builder(pool)
b.AppendValues([]uint8{0, 1, 2}, nil)
return b.NewUint8Array().Data()
}(),
expected: []uint8{0, 1, 2},
err: nil,
},
{
data: func() *array.Data {
b := array.NewUint8Builder(pool)
b.AppendValues([]uint8{0, 1, 2}, nil)
return b.NewUint8Array().Data()
}(),
expected: []uint8{0, 1, 2},
err: nil,
},
*/

// uint16
Expand Down
3 changes: 2 additions & 1 deletion columnifier/parquet.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,11 @@ package columnifier

import (
"bytes"
"io/ioutil"

"github.com/reproio/columnify/arrow/json"
"github.com/reproio/columnify/record"
"github.com/xitongsys/parquet-go/marshal"
"io/ioutil"

"github.com/reproio/columnify/parquet"
"github.com/reproio/columnify/schema"
Expand Down
28 changes: 11 additions & 17 deletions record/arrow.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,11 @@ package record

import (
"fmt"
"github.com/apache/arrow/go/arrow"
"github.com/apache/arrow/go/arrow/array"
"github.com/apache/arrow/go/arrow/memory"
"strconv"
"time"

"github.com/apache/arrow/go/arrow"
"github.com/apache/arrow/go/arrow/array"
)

type WrappedRecord struct {
Expand All @@ -19,24 +19,18 @@ func NewWrappedRecord(b *array.RecordBuilder) *WrappedRecord {
}
}

func formatMapToArrowRecord(s *arrow.Schema, maps []map[string]interface{}) (*WrappedRecord, error) {
pool := memory.NewGoAllocator()
b := array.NewRecordBuilder(pool, s)
defer b.Release()

for _, m := range maps {
for i, f := range s.Fields() {
if v, ok := m[f.Name]; ok {
if _, err := formatMapToArrowField(b.Field(i), f.Type, f.Nullable, v); err != nil {
return nil, err
}
} else {
b.Field(i).AppendNull()
func formatMapToArrowRecord(b *array.RecordBuilder, m map[string]interface{}) (*array.RecordBuilder, error) {
for i, f := range b.Schema().Fields() {
if v, ok := m[f.Name]; ok {
if _, err := formatMapToArrowField(b.Field(i), f.Type, f.Nullable, v); err != nil {
return nil, err
}
} else {
b.Field(i).AppendNull()
}
}

return NewWrappedRecord(b), nil
return b, nil
}

func formatMapToArrowStruct(b *array.StructBuilder, s *arrow.StructType, m map[string]interface{}) (*array.StructBuilder, error) {
Expand Down
16 changes: 11 additions & 5 deletions record/arrow_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -430,17 +430,23 @@ func TestNewArrowSchemaFromAvroSchema(t *testing.T) {
},
}

pool := memory.NewGoAllocator()
for _, c := range cases {
expectedRecord := c.expected(c.schema)

actual, err := formatMapToArrowRecord(c.schema.ArrowSchema, c.input)
b := array.NewRecordBuilder(pool, c.schema.ArrowSchema)
defer b.Release()

if err != c.err {
t.Errorf("expected: %v, but actual: %v\n", c.err, err)
for _, v := range c.input {
_, err := formatMapToArrowRecord(b, v)
if err != c.err {
t.Errorf("expected: %v, but actual: %v\n", c.err, err)
}
}

if !reflect.DeepEqual(actual, expectedRecord) {
t.Errorf("values: expected: %v, but actual: %v\n", expectedRecord, actual)
r := NewWrappedRecord(b)
if !reflect.DeepEqual(r, expectedRecord) {
t.Errorf("values: expected: %v, but actual: %v\n", expectedRecord, r)
}
}
}
26 changes: 24 additions & 2 deletions record/avro.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ import (
"bytes"
"fmt"

"github.com/apache/arrow/go/arrow/array"
"github.com/apache/arrow/go/arrow/memory"
"github.com/reproio/columnify/schema"

"github.com/linkedin/goavro/v2"
Expand Down Expand Up @@ -57,10 +59,30 @@ func FormatAvroToMap(data []byte) ([]map[string]interface{}, error) {
}

func FormatAvroToArrow(s *schema.IntermediateSchema, data []byte) (*WrappedRecord, error) {
maps, err := FormatAvroToMap(data)
pool := memory.NewGoAllocator()
b := array.NewRecordBuilder(pool, s.ArrowSchema)
defer b.Release()

r, err := goavro.NewOCFReader(bytes.NewReader(data))
if err != nil {
return nil, err
}

return formatMapToArrowRecord(s.ArrowSchema, maps)
for r.Scan() {
v, err := r.Read()
if err != nil {
return nil, err
}

m, mapOk := v.(map[string]interface{})
if !mapOk {
return nil, fmt.Errorf("invalid value %v: %w", v, ErrUnconvertibleRecord)
}

if _, err = formatMapToArrowRecord(b, flattenAvroUnion(m)); err != nil {
return nil, err
}
}

return NewWrappedRecord(b), nil
}
59 changes: 57 additions & 2 deletions record/csv.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@ import (
"strconv"
"strings"

"github.com/apache/arrow/go/arrow/array"
"github.com/apache/arrow/go/arrow/memory"

"github.com/reproio/columnify/schema"
)

Expand Down Expand Up @@ -89,10 +92,62 @@ func FormatCsvToMap(s *schema.IntermediateSchema, data []byte, delimiter delimit
}

func FormatCsvToArrow(s *schema.IntermediateSchema, data []byte, delimiter delimiter) (*WrappedRecord, error) {
maps, err := FormatCsvToMap(s, data, delimiter)
pool := memory.NewGoAllocator()
b := array.NewRecordBuilder(pool, s.ArrowSchema)
defer b.Release()

names, err := getFieldNamesFromSchema(s)
if err != nil {
return nil, err
}

return formatMapToArrowRecord(s.ArrowSchema, maps)
reader := csv.NewReader(strings.NewReader(string(data)))
reader.Comma = rune(delimiter)

numFields := len(names)
for {
values, err := reader.Read()
if err == io.EOF {
break
}
if err != nil {
return nil, err
}

if numFields != len(values) {
return nil, fmt.Errorf("incompleted value %v: %w", values, ErrUnconvertibleRecord)
}

e := make(map[string]interface{})
for i, v := range values {
// bool
if v != "0" && v != "1" {
if vv, err := strconv.ParseBool(v); err == nil {
e[names[i]] = vv
continue
}
}

// int
if vv, err := strconv.ParseInt(v, 10, 64); err == nil {
e[names[i]] = vv
continue
}

// float
if vv, err := strconv.ParseFloat(v, 64); err == nil {
e[names[i]] = vv
continue
}

// others; to string
e[names[i]] = v
}

if _, err := formatMapToArrowRecord(b, e); err != nil {
return nil, err
}
}

return NewWrappedRecord(b), nil
}
26 changes: 22 additions & 4 deletions record/jsonl.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@ import (
"encoding/json"
"strings"

"github.com/apache/arrow/go/arrow/array"
"github.com/apache/arrow/go/arrow/memory"

"github.com/reproio/columnify/schema"
)

Expand All @@ -29,10 +32,25 @@ func FormatJsonlToMap(data []byte) ([]map[string]interface{}, error) {
}

func FormatJsonlToArrow(s *schema.IntermediateSchema, data []byte) (*WrappedRecord, error) {
maps, err := FormatJsonlToMap(data)
if err != nil {
return nil, err
pool := memory.NewGoAllocator()
b := array.NewRecordBuilder(pool, s.ArrowSchema)
defer b.Release()

for _, l := range strings.Split(string(data), "\n") {
if l == "" {
// skip blank line
continue
}

var e map[string]interface{}
if err := json.Unmarshal([]byte(l), &e); err != nil {
return nil, err
}

if _, err := formatMapToArrowRecord(b, e); err != nil {
return nil, err
}
}

return formatMapToArrowRecord(s.ArrowSchema, maps)
return NewWrappedRecord(b), nil
}
49 changes: 45 additions & 4 deletions record/ltsv.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@ import (
"strconv"
"strings"

"github.com/apache/arrow/go/arrow/array"
"github.com/apache/arrow/go/arrow/memory"

"github.com/reproio/columnify/schema"

"github.com/Songmu/go-ltsv"
Expand Down Expand Up @@ -54,10 +57,48 @@ func FormatLtsvToMap(data []byte) ([]map[string]interface{}, error) {
}

func FormatLtsvToArrow(s *schema.IntermediateSchema, data []byte) (*WrappedRecord, error) {
maps, err := FormatLtsvToMap(data)
if err != nil {
return nil, err
pool := memory.NewGoAllocator()
b := array.NewRecordBuilder(pool, s.ArrowSchema)
defer b.Release()

for _, l := range strings.Split(string(data), "\n") {
v := map[string]string{}

err := ltsv.Unmarshal([]byte(l), &v)
if err != nil {
return nil, err
}

m := make(map[string]interface{})
for k, v := range v {
// bool
if v != "0" && v != "1" {
if vv, err := strconv.ParseBool(v); err == nil {
m[k] = vv
continue
}
}

// int
if vv, err := strconv.ParseInt(v, 10, 64); err == nil {
m[k] = vv
continue
}

// float
if vv, err := strconv.ParseFloat(v, 64); err == nil {
m[k] = vv
continue
}

// others; to string
m[k] = v
}

if _, err := formatMapToArrowRecord(b, m); err != nil {
return nil, err
}
}

return formatMapToArrowRecord(s.ArrowSchema, maps)
return NewWrappedRecord(b), nil
}
29 changes: 25 additions & 4 deletions record/msgpack.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@ import (
"fmt"
"io"

"github.com/apache/arrow/go/arrow/array"
"github.com/apache/arrow/go/arrow/memory"

"github.com/reproio/columnify/schema"

"github.com/vmihailenco/msgpack/v4"
Expand Down Expand Up @@ -34,10 +37,28 @@ func FormatMsgpackToMap(data []byte) ([]map[string]interface{}, error) {
}

func FormatMsgpackToArrow(s *schema.IntermediateSchema, data []byte) (*WrappedRecord, error) {
maps, err := FormatMsgpackToMap(data)
if err != nil {
return nil, err
pool := memory.NewGoAllocator()
b := array.NewRecordBuilder(pool, s.ArrowSchema)
defer b.Release()

d := msgpack.NewDecoder(bytes.NewReader(data))
for {
arr, err := d.DecodeInterface()
if err == io.EOF {
break
} else if err != nil {
return nil, err
}

m, mapOk := arr.(map[string]interface{})
if !mapOk {
return nil, fmt.Errorf("invalid input %v: %w", arr, ErrUnconvertibleRecord)
}

if _, err = formatMapToArrowRecord(b, m); err != nil {
return nil, err
}
}

return formatMapToArrowRecord(s.ArrowSchema, maps)
return NewWrappedRecord(b), nil
}

0 comments on commit 68671df

Please sign in to comment.