Skip to content

Commit

Permalink
refactor(flex): Replace the Adhoc csv reader with Arrow CSV reader (#…
Browse files Browse the repository at this point in the history
…3154)

1. Use Arrow CSV Reader to replace current adhoc csv reader, to support
more configurable options in `bulk_load.yaml`.
2. Introduce `CSVFragmentLoader`, `BasicFragmentLoader` for
`MutablePropertyFragment`.

With this PR merged, `MutablePropertyFragment` will support loading
fragment from csv with options:
- delimeter: default '|'
- header_row: default true
- quoting: default false
- quoting_char: default '"'
- escaping: default false
- escaping_char: default'\\'
- batch_size: the batch size of when reading file into memory, default
1MB.
- batch_reader: default false. If set to true,
`arrow::csv::StreamingReader` will be used to parse the input file.
Otherwise, `arrow::TableReader` will be used.


With this PR merged, the performance of graph loading will be improved. 
The Adhoc Reader denote the current implemented csv parser, 1,2,4,8
denotes the parallelism of graph loading, i.e. how many labels of
vertex/edge are concurrently processed.

Note that TableReader is around 10x faster than StreamingReader. The
possible reason could be the multi-threading is used.
See [arrow-csv-doc](https://arrow.apache.org/docs/cpp/csv.html) for
details.


### SF30
| Reader | Phase | 1 | 2 | 4 | 8 |
| --------- | -------------- | ---------- |---------- |----------
|---------- |
| Adhoc Reader | ReadFile\+LoadGraph |805s|	468s|	349s|	313s|
| Adhoc Reader | Serialization | 126s|	126s|	126s|	126s|
| Adhoc Reader  | **Total** |931s|	594s|	475s|	439s|
| Table Reader |  ReadFile | 9s	|9s	|9s|	9s|
| Table Reader | LoadGraph |455s|	280s|	211s|	182s|
| Table Reader |Serialization |126s|	126s|	126s|	126s|
| Table Reader | **Total** | 600s|	415s|	346s|	317s|
| Streaming Reader | ReadFile |91s|	91s|	91s|	91s|
| Streaming Reader | LoadGraph | 555s|	289s|	196s|	149s|
| Streaming Reader | Serialization |126s|	126s|	126s|	126s|
| Streaming Reader | **Total** | 772s|	506s|	413s|	366s|

### SF100
| Reader | Phase | 1 | 2 | 4 | 8 |
| --------- | -------------- | ---------- |---------- |----------
|---------- |
| Adhoc Reader | ReadFile\+LoadGraph |2720s|	1548s|	1176s|	948s|
| Adhoc Reader | Serialization | 409s|	409s|	409s|	409s|
| Adhoc Reader  | **Total** | 3129s|	1957s|	1585s|	1357s|
| Table Reader |  ReadFile |24s|	24s|	24s|	24s|
| Table Reader | LoadGraph |1576s|	949s|	728s|	602s|
| Table Reader |Serialization |409s|	409s|	409s|	409s|
| Table Reader | **Total** | 2009s|	1382s|	1161s|	1035s|
| Streaming Reader | ReadFile |300s|	300s|	300s|	300s|
| Streaming Reader | LoadGraph | 1740s|	965s|	669s|	497s|
| Streaming Reader | Serialization | 409s|	409s|	409s|	409s|
| Streaming Reader | **Total** | 2539s|	1674s|	1378s|	1206s|
### SF300
| Reader | Phase | 1 | 2 | 4 | 8 |
| --------- | -------------- | ---------- |---------- |----------
|---------- |
| Adhoc Reader | ReadFile\+LoadGraph | 8260s|	4900s	|3603s	|2999s|
| Adhoc Reader | Serialization | 1201s |	1201s|	1201s	|1201s|
| Adhoc Reader  | **Total** | 9461s|	6101s | 4804s	|4200s|
| Table Reader |  ReadFile | 73s	|73s|	96s|	96s|
| Table Reader | LoadGraph |4650s|	2768s|	2155s	|1778s|
| Table Reader |Serialization | 1201s |	1201s|	1201s	|1201s|
| Table Reader | **Total** | 5924s|	4042s|	3452s|	3075s|
| Streaming Reader | ReadFile | 889s |889s | 889s| 889s|
| Streaming Reader | LoadGraph | 5589s|	3005s|	2200s|	1712s|
| Streaming Reader | Serialization | 1201s| 1201s| 1201s |1201s |
| Streaming Reader | **Total** | 7679s	| 5095s |4290s| 	3802s|


FIx #3116
  • Loading branch information
zhanglei1949 authored Sep 8, 2023
1 parent d4d5506 commit 9f1d776
Show file tree
Hide file tree
Showing 42 changed files with 2,843 additions and 1,083 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/flex.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ jobs:
runs-on: ubuntu-20.04
if: ${{ github.repository == 'alibaba/GraphScope' }}
container:
image: registry.cn-hongkong.aliyuncs.com/graphscope/hqps-server-base:v0.0.4
image: registry.cn-hongkong.aliyuncs.com/graphscope/hqps-server-base:v0.0.6
steps:
- uses: actions/checkout@v3

Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/hqps-db-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ jobs:
runs-on: ubuntu-20.04
if: ${{ github.repository == 'alibaba/GraphScope' }}
container:
image: registry.cn-hongkong.aliyuncs.com/graphscope/hqps-server-base:v0.0.4
image: registry.cn-hongkong.aliyuncs.com/graphscope/hqps-server-base:v0.0.6
steps:
- uses: actions/checkout@v3

Expand Down
2 changes: 1 addition & 1 deletion flex/.devcontainer.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
{
"name": "GraphScope",
// Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile
"image": "registry.cn-hongkong.aliyuncs.com/graphscope/hqps-server-base:v0.0.4",
"image": "registry.cn-hongkong.aliyuncs.com/graphscope/hqps-server-base:v0.0.6",
// Features to add to the dev container. More info: https://containers.dev/features.
"features": {
"ghcr.io/devcontainers/features/common-utils:2": {
Expand Down
15 changes: 15 additions & 0 deletions flex/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../)
set(DEFAULT_BUILD_TYPE "Release")

set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17 -mno-avx512f -fPIC")
set(CMAKE_CXX_FLAGS_DEBUG "-g3 -O0")


add_compile_definitions(FLEX_VERSION="${FLEX_VERSION}")
Expand Down Expand Up @@ -61,6 +62,20 @@ find_package(Boost REQUIRED COMPONENTS system filesystem
# required by folly
context program_options regex thread)

#find arrow----------------------------------------------------------------------
include("cmake/FindArrow.cmake")
if (NOT ARROW_FOUND)
message(FATAL_ERROR "arrow not found, please install the arrow library")
else ()
include_directories(SYSTEM ${ARROW_INCLUDE_DIRS})
if (TARGET arrow_shared)
set(ARROW_SHARED_LIB arrow_shared)
endif()
if (TARGET arrow_static)
set(ARROW_STATIC_LIB arrow_static)
endif()
endif ()

# Find Doxygen
if (BUILD_DOC)
find_package(Doxygen)
Expand Down
34 changes: 19 additions & 15 deletions flex/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ ARG CI=false
SHELL ["/bin/bash", "-c"]


RUN apt update && apt -y install locales && locale-gen en_US.UTF-8
RUN apt-get update && apt-get -y install locales && locale-gen en_US.UTF-8
ENV LANG en_US.UTF-8
ENV LANGUAGE en_US:en
ENV LC_ALL en_US.UTF-8
Expand All @@ -15,29 +15,32 @@ ENV TZ=Asia/Shanghai
RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone

# install dependencies
RUN apt install -y \
ninja-build ragel libhwloc-dev libnuma-dev libpciaccess-dev vim wget \
git g++ libgoogle-glog-dev cmake libopenmpi-dev default-jdk libcrypto++-dev \
libboost-all-dev libxml2-dev
RUN apt install -y xfslibs-dev libgnutls28-dev liblz4-dev maven openssl pkg-config \
libsctp-dev gcc make python3 systemtap-sdt-dev libtool libyaml-cpp-dev \
libc-ares-dev stow libfmt-dev diffutils valgrind doxygen python3-pip net-tools
RUN apt-get update && apt-get install -y vim wget \
git g++ libgoogle-glog-dev cmake libopenmpi-dev default-jdk \
libboost-all-dev
RUN apt-get install -y maven openssl \
gcc make python3 libyaml-cpp-dev \
libc-ares-dev doxygen python3-pip net-tools curl

# install libgrape-lite
RUN cd /root && \
RUN cd /tmp && \
git clone https://github.com/alibaba/libgrape-lite.git -b v0.3.2 --single-branch && cd libgrape-lite && \
mkdir build && cd build && cmake .. && make -j && make install
mkdir build && cd build && cmake .. && make -j && make install && rm -rf /tmp/libgrape-lite

RUN cp /usr/local/lib/libgrape-lite.so /usr/lib/libgrape-lite.so

RUN git clone https://github.com/alibaba/hiactor.git -b v0.1.1 --single-branch && cd hiactor && \
RUN cd /tmp && git clone https://github.com/alibaba/hiactor.git -b v0.1.1 --single-branch && cd hiactor && \
git submodule update --init --recursive && ./seastar/seastar/install-dependencies.sh && mkdir build && cd build && \
cmake -DHiactor_DEMOS=OFF -DHiactor_TESTING=OFF -DHiactor_DPDK=OFF -DHiactor_CXX_DIALECT=gnu++17 -DSeastar_CXX_FLAGS="-DSEASTAR_DEFAULT_ALLOCATOR -mno-avx512" .. && \
make -j && make install
make -j && make install && rm -rf /tmp/hiactor

#install protobuf
RUN apt-get install -y protobuf-compiler libprotobuf-dev

#install arrow
RUN cd /tmp && apt-get install -y -V ca-certificates lsb-release wget && \
curl -o apache-arrow-apt-source-latest.deb https://apache.jfrog.io/artifactory/arrow/$(lsb_release --id --short | tr 'A-Z' 'a-z')/apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb && \
apt-get install -y ./apache-arrow-apt-source-latest.deb && \
apt-get update && apt-get install -y libarrow-dev=6.0.1-1

RUN apt-get install -y sudo

# Add graphscope user with user id 1001
Expand All @@ -49,8 +52,9 @@ USER graphscope
WORKDIR /home/graphscope

RUN curl -sf -L https://static.rust-lang.org/rustup.sh | \
sh -s -- -y --profile minimal && \
sh -s -- -y --profile minimal --default-toolchain=1.70.0 && \
chmod +x "$HOME/.cargo/env" && \
echo "$source $HOME/.cargo/env" >> ~/.bashrc && \
source "$HOME/.cargo/env" && \
echo "1.70.0" > rust-toolchain && \
bash -c "rustup component add rustfmt"
Loading

0 comments on commit 9f1d776

Please sign in to comment.