Skip to content

Commit

Permalink
CHYT-700: Include simdjson to clickhouse build
Browse files Browse the repository at this point in the history
19ab39c72bdf4b7a36759d391055c14309003306
  • Loading branch information
buyval01 committed Sep 9, 2024
1 parent 8b593c1 commit a0f725b
Show file tree
Hide file tree
Showing 228 changed files with 32,381 additions and 3 deletions.
1 change: 1 addition & 0 deletions .mapping.json
Original file line number Diff line number Diff line change
Expand Up @@ -744,6 +744,7 @@
"contrib/libs/re2/CMakeLists.darwin-x86_64.txt":"",
"contrib/libs/re2/CMakeLists.linux-x86_64.txt":"",
"contrib/libs/re2/CMakeLists.txt":"",
"contrib/libs/simdjson/CMakeLists.txt":"",
"contrib/libs/snappy/CMakeLists.darwin-arm64.txt":"",
"contrib/libs/snappy/CMakeLists.darwin-x86_64.txt":"",
"contrib/libs/snappy/CMakeLists.linux-x86_64.txt":"",
Expand Down
2 changes: 1 addition & 1 deletion contrib/clickhouse/includes/configs/clickhouse_config.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
#define USE_CASSANDRA 0
#define USE_SENTRY 0
#define USE_GRPC 0
#define USE_SIMDJSON 0
#define USE_SIMDJSON 1
#define USE_RAPIDJSON 1
#define USE_DATASKETCHES 0
#define USE_YAML_CPP 0
Expand Down
4 changes: 4 additions & 0 deletions contrib/clickhouse/src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ target_include_directories(contrib-clickhouse-src PRIVATE
${PROJECT_SOURCE_DIR}/contrib/libs/msgpack/include
${PROJECT_SOURCE_DIR}/contrib/libs/pdqsort
${PROJECT_SOURCE_DIR}/contrib/libs/rapidjson/include
${PROJECT_SOURCE_DIR}/contrib/libs/simdjson/include
${PROJECT_SOURCE_DIR}/contrib/libs/sparsehash/src
${PROJECT_SOURCE_DIR}/contrib/libs/zstd/include
${PROJECT_SOURCE_DIR}/contrib/restricted/cityhash-1.0.2
Expand Down Expand Up @@ -121,6 +122,7 @@ target_link_libraries(contrib-clickhouse-src PUBLIC
libs-poco-XML
contrib-libs-rapidjson
contrib-libs-re2
contrib-libs-simdjson
contrib-libs-sparsehash
contrib-libs-wyhash
contrib-libs-xxhash
Expand Down Expand Up @@ -1784,6 +1786,7 @@ target_include_directories(contrib-clickhouse-src.global PRIVATE
${PROJECT_SOURCE_DIR}/contrib/libs/msgpack/include
${PROJECT_SOURCE_DIR}/contrib/libs/pdqsort
${PROJECT_SOURCE_DIR}/contrib/libs/rapidjson/include
${PROJECT_SOURCE_DIR}/contrib/libs/simdjson/include
${PROJECT_SOURCE_DIR}/contrib/libs/sparsehash/src
${PROJECT_SOURCE_DIR}/contrib/libs/zstd/include
${PROJECT_SOURCE_DIR}/contrib/restricted/cityhash-1.0.2
Expand Down Expand Up @@ -1829,6 +1832,7 @@ target_link_libraries(contrib-clickhouse-src.global PUBLIC
libs-poco-XML
contrib-libs-rapidjson
contrib-libs-re2
contrib-libs-simdjson
contrib-libs-sparsehash
contrib-libs-wyhash
contrib-libs-xxhash
Expand Down
2 changes: 1 addition & 1 deletion contrib/clickhouse/src/Common/JSONParsers/SimdJSONParser.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
# include <base/types.h>
# include <Common/Exception.h>
# include <base/defines.h>
# error #include <simdjson.h>
# include <simdjson.h>
# include "ElementTypes.h"

namespace DB
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ const char * auto_config_build[]
"OPENSSL_VERSION", "1.1.1g",
"OPENSSL_IS_BORING_SSL", "1",
"USE_VECTORSCAN", "",
"USE_SIMDJSON", "OFF",
"USE_SIMDJSON", "1",
"USE_ODBC", "",
"USE_GRPC", "",
"USE_LDAP", "",
Expand Down
2 changes: 2 additions & 0 deletions contrib/clickhouse/src/ya.make
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ PEERDIR(
contrib/libs/poco/XML
contrib/libs/rapidjson
contrib/libs/re2
contrib/libs/simdjson
contrib/libs/sparsehash
contrib/libs/wyhash
contrib/libs/xxhash
Expand Down Expand Up @@ -106,6 +107,7 @@ ADDINCL(
contrib/libs/msgpack/include
contrib/libs/pdqsort
contrib/libs/rapidjson/include
contrib/libs/simdjson/include
contrib/libs/sparsehash/src
contrib/libs/zstd/include
contrib/restricted/cityhash-1.0.2
Expand Down
1 change: 1 addition & 0 deletions contrib/libs/CMakeLists.linux-x86_64.txt
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ add_subdirectory(fmath)
add_subdirectory(libdivide)
add_subdirectory(metrohash)
add_subdirectory(msgpack)
add_subdirectory(simdjson)
add_subdirectory(wyhash)
add_subdirectory(h3)
add_subdirectory(liburing)
Expand Down
4 changes: 4 additions & 0 deletions contrib/libs/simdjson/AUTHORS
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# List of authors for copyright purposes, in no particular order
Daniel Lemire
Geoff Langdale
John Keiser
42 changes: 42 additions & 0 deletions contrib/libs/simdjson/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# This file was generated by the YaTool build system (https://github.com/yandex/yatool),
# from a source YaTool build configuration provided in ya.make files.
#
# If the repository supports both CMake and ya build configurations, please modify both of them.
#
# If only CMake build configuration is supported then modify only CMake files and note that only
# simple modifications are allowed like adding source-files to targets or adding simple properties
# like target_include_directories. These modifications will be ported to original ya.make files
# by maintainers. Any complex modifications which can't be easily ported back to the ya build
# system may be rejected.
#
# Please refer to the build instructions in the repository for more information about manual
# changes in this file.

# Locate the linux-headers-generic package. REQUIRED makes configuration stop
# with an error if the package cannot be found.
find_package(linux-headers-generic REQUIRED)

# Declare the simdjson library target. No sources are listed here; they are
# attached later in this file with target_sources().
add_library(contrib-libs-simdjson)


# Preprocessor macros used only while compiling simdjson itself (PRIVATE, so
# they are not propagated to targets that link against this library).
# Declared as compile definitions rather than raw -D flags so that CMake emits
# the define syntax appropriate for the active compiler.
target_compile_definitions(contrib-libs-simdjson PRIVATE
  SIMDJSON_AVX512_ALLOWED=1
  SIMDJSON_UTF8VALIDATION=1
)

# Pass -Wno-everything for every compiler except MSVC; on MSVC the generator
# expression expands to nothing, so no option is added.
target_compile_options(contrib-libs-simdjson PRIVATE
  $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wno-everything>
)

# Header search paths for simdjson:
#   PUBLIC  - include/ is used by this target and exported to its consumers.
#   PRIVATE - src/ is visible only while compiling this target.
target_include_directories(contrib-libs-simdjson
  PUBLIC
    ${PROJECT_SOURCE_DIR}/contrib/libs/simdjson/include
  PRIVATE
    ${PROJECT_SOURCE_DIR}/contrib/libs/simdjson/src
)

# Link dependencies. PUBLIC means each entry is used by this target and is also
# propagated to every target that links against contrib-libs-simdjson.
target_link_libraries(contrib-libs-simdjson PUBLIC
linux-headers-generic::linux-headers-generic
contrib-libs-cxxsupp
)

# The library is compiled from a single source file, src/simdjson.cpp.
target_sources(contrib-libs-simdjson PRIVATE
${PROJECT_SOURCE_DIR}/contrib/libs/simdjson/src/simdjson.cpp
)

103 changes: 103 additions & 0 deletions contrib/libs/simdjson/CONTRIBUTING.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
Contributing
============

The simdjson library is an open project written in C++. Contributions are invited. Contributors
agree to the project's license.

We have an extensive list of issues, and contributions toward any of these issues are invited.
Contributions can take the form of code samples, better documentation or design ideas.

In particular, the following contributions are invited:

- The library is focused on performance. Well-documented performance optimizations are invited.
- Fixes to known or newly discovered bugs are always welcome. Typically, a bug fix should come with
a test demonstrating that the bug has been fixed.
- The simdjson library is advanced software and maintainability and flexibility are always a
concern. Specific contributions to improve maintainability and flexibility are invited.

We discourage the following types of contributions:

- Code refactoring. We all have our preferences as to how code should be written, but unnecessary
refactoring can waste time and introduce new bugs. If you believe that refactoring is needed, you
first must explain how it helps in concrete terms. Does it improve the performance?
- Applications of new language features for their own sake. Using advanced C++ language constructs
is actually a negative as it may reduce portability (to old compilers, old standard libraries and
systems) and reduce accessibility (to programmers that have not kept up), so it must be offset
by clear gains like performance or maintainability. When in doubt, avoid advanced C++ features
(beyond C++11).
- Style formatting. In general, please abstain from reformatting code just to make it look prettier.
Though code formatting is important, it can also be a waste of time if several contributors try to
tweak the code base toward their own preference. Please do not introduce unneeded white-space
changes.

In short, most code changes should either bring new features or better performance. We want to avoid unmotivated code changes.


Specific rules
----------

We have few hard rules, but we have some:

- Printing to standard output or standard error (`stderr`, `stdout`, `std::cerr`, `std::cout`) in the core library is forbidden. This follows from the [Writing R Extensions](https://cran.r-project.org/doc/manuals/R-exts.html) manual which states that "Compiled code should not write to stdout or stderr".
- Calls to `abort()` are forbidden in the core library. This follows from the [Writing R Extensions](https://cran.r-project.org/doc/manuals/R-exts.html) manual which states that "Under no circumstances should your compiled code ever call abort or exit".
- All source code files (.h, .cpp) must be ASCII.
- All C macros introduced in public headers need to be prefixed with either `SIMDJSON_` or `simdjson_`.
- We avoid trailing white space characters within lines. That is, your lines of code should not terminate with unnecessary spaces. Generally, please avoid making unnecessary changes to white-space characters when contributing code.

Tools, tests and benchmarks are not held to these same strict rules.

General Guidelines
----------

Contributors are encouraged to :

- Document their changes. Though we do not enforce a rule regarding code comments, we prefer that non-trivial algorithms and techniques be somewhat documented in the code.
- Follow as much as possible the existing code style. We do not enforce a specific code style, but we prefer consistency. We avoid contractions (isn't, aren't) in the comments.
- Modify as few lines of code as possible when working on an issue. The more lines you modify, the harder it is for your fellow human beings to understand what is going on.
- Tools may report "problems" with the code, but we never delegate programming to tools: if there is a problem with the code, we need to understand it. Thus we will not "fix" code merely to please a static analyzer.
- Provide tests for any new feature. We will not merge a new feature without tests.
- Run before/after benchmarks so that we can appreciate the effect of the changes on the performance.

Pull Requests
--------------

Pull requests are always invited. However, we ask that you follow these guidelines:

- It is wise to discuss your ideas first as part of an issue before you start coding. If you omit this step and code first, be prepared to have your code receive scrutiny and be dropped.
- Users should provide a rationale for their changes. Does it improve performance? Does it add a feature? Does it improve maintainability? Does it fix a bug? This must be explicitly stated as part of the pull request. Do not propose changes based on taste or intuition. We do not delegate programming to tools: that some tool suggested a code change is not reason enough to change the code.
1. When your code improves performance, please document the gains with a benchmark using hard numbers.
2. If your code fixes a bug, please either fix a failing test, or propose a new test.
3. Other types of changes must be clearly motivated. We openly discourage changes with no identifiable benefits.
- Changes should be focused and minimal. You should change as few lines of code as possible. Please do not reformat or touch files needlessly.
- New features must be accompanied by new tests, in general.
- Your code should pass our continuous-integration tests. It is your responsibility to ensure that your proposal passes the tests. We do not merge pull requests that would break our build.
- An exception to this would be changes to non-code files, such as documentation and assets, or trivial changes to code, such as comments, where it is encouraged to explicitly ask for skipping a CI run using the `[skip ci]` prefix in your Pull Request title **and** in the first line of the most recent commit in a push. Example for such a commit: `[skip ci] Fixed typo in power_of_ten's docs`
This benefits the project in such a way that the CI pipeline is not burdened by running jobs on changes that don't change any behavior in the code, which reduces wait times for other Pull Requests that do change behavior and require testing.

If the benefits of your proposed code remain unclear, we may choose to discard your code: that is not an insult, we frequently discard our own code. We may also consider various alternatives and choose another path. Again, that is not an insult or a sign that you have wasted your time.

Style
-----

Our formatting style is inspired by the LLVM style.
The simdjson library is written using the snake case: when a variable or a function is a phrase, each space is replaced by an underscore character, and the first letter of each word is written in lowercase. Compile-time constants are written entirely in uppercase with the same underscore convention.

Code of Conduct
---------------

Though we do not have a formal code of conduct, we will not tolerate bullying, bigotry or
intimidation. Everyone is welcome to contribute. If you have concerns, you can raise them privately with the core team members (e.g., D. Lemire, J. Keiser).

We welcome contributions from women and less represented groups. If you need help, please reach out.

Consider the following points when engaging with the project:

- We discourage arguments from authority: ideas are discussed on their own merits and not based on who stated them.
- Be mindful that what you may view as an aggression is maybe merely a difference of opinion or a misunderstanding.
- Be mindful that a collection of small aggressions, even if mild in isolation, can become harmful.

Getting Started Hacking
-----------------------

An overview of simdjson's directory structure, with pointers to architecture and design
considerations and other helpful notes, can be found at [HACKING.md](HACKING.md).
Loading

0 comments on commit a0f725b

Please sign in to comment.