-
Notifications
You must be signed in to change notification settings - Fork 8.3k
add compression method for files: Xz #16578
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
c1abf5d
5982f9f
731e274
be2b002
495cd47
805bfd2
2ad01c5
ba6fa5d
8098f86
8ecf1d0
f9cebbf
986d13d
53a064b
87cc354
ceda5cb
73e5d28
268f289
6286775
f999ea2
124ef2f
9479052
55d05c9
1b06fd9
fe5800a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||
|---|---|---|---|---|
|
|
@@ -36,6 +36,7 @@ add_subdirectory (murmurhash) | |||
| add_subdirectory (replxx-cmake) | ||||
| add_subdirectory (ryu-cmake) | ||||
| add_subdirectory (unixodbc-cmake) | ||||
| add_subdirectory (xz) | ||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Looks like you've forgotten to add the submodule
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not really, this is just to know where to fetch the submodule from. You need to run something like this: And this will create a special file
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Also, to make fasttest update your submodule, you need to add it in ClickHouse/docker/test/fasttest/run.sh (line 130 in f10a520)
Without fasttest other tests won't be run
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Also it's possible to disable |
||||
|
|
||||
| add_subdirectory (poco-cmake) | ||||
| add_subdirectory (croaring-cmake) | ||||
|
|
||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -330,6 +330,13 @@ if (ZSTD_LIBRARY) | |
| endif () | ||
| endif() | ||
|
|
||
| set (LZMA_LIBRARY liblzma) | ||
| set (LZMA_INCLUDE_DIR ${ClickHouse_SOURCE_DIR}/contrib/xz/src/liblzma/api) | ||
|
Comment on lines
+333
to
+334
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. And I guess it is better to move this out into separate
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What should I do to pass description check?
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Update description of PR and include changelog entry using specified format. Just copy it and modify the template, but leave the format.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, it's much better to provide our own CMakeLists. CMake often cannot be safely reused (without building unneeded targets, polluting build options). |
||
| if (LZMA_LIBRARY) | ||
| target_link_libraries (clickhouse_common_io PUBLIC ${LZMA_LIBRARY}) | ||
| target_include_directories (clickhouse_common_io SYSTEM BEFORE PUBLIC ${LZMA_INCLUDE_DIR}) | ||
| endif() | ||
|
|
||
| if (USE_ICU) | ||
| dbms_target_link_libraries (PRIVATE ${ICU_LIBRARIES}) | ||
| dbms_target_include_directories (SYSTEM PRIVATE ${ICU_INCLUDE_DIRS}) | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,11 +1,13 @@ | ||
| #include <IO/CompressionMethod.h> | ||
|
|
||
| #include <IO/BrotliReadBuffer.h> | ||
| #include <IO/BrotliWriteBuffer.h> | ||
| #include <IO/LZMADeflatingWriteBuffer.h> | ||
| #include <IO/LZMAInflatingReadBuffer.h> | ||
| #include <IO/ReadBuffer.h> | ||
| #include <IO/WriteBuffer.h> | ||
| #include <IO/ZlibInflatingReadBuffer.h> | ||
| #include <IO/ZlibDeflatingWriteBuffer.h> | ||
| #include <IO/BrotliReadBuffer.h> | ||
| #include <IO/BrotliWriteBuffer.h> | ||
| #include <IO/ZlibInflatingReadBuffer.h> | ||
|
|
||
| #if !defined(ARCADIA_BUILD) | ||
| # include <Common/config.h> | ||
|
|
@@ -14,7 +16,6 @@ | |
|
|
||
| namespace DB | ||
| { | ||
|
|
||
| namespace ErrorCodes | ||
| { | ||
| extern const int NOT_IMPLEMENTED; | ||
|
|
@@ -25,10 +26,16 @@ std::string toContentEncodingName(CompressionMethod method) | |
| { | ||
| switch (method) | ||
| { | ||
| case CompressionMethod::Gzip: return "gzip"; | ||
| case CompressionMethod::Zlib: return "deflate"; | ||
| case CompressionMethod::Brotli: return "br"; | ||
| case CompressionMethod::None: return ""; | ||
| case CompressionMethod::Gzip: | ||
| return "gzip"; | ||
| case CompressionMethod::Zlib: | ||
| return "deflate"; | ||
| case CompressionMethod::Brotli: | ||
| return "br"; | ||
| case CompressionMethod::Xz: | ||
| return "xz"; | ||
| case CompressionMethod::None: | ||
| return ""; | ||
| } | ||
| __builtin_unreachable(); | ||
| } | ||
|
|
@@ -52,27 +59,28 @@ CompressionMethod chooseCompressionMethod(const std::string & path, const std::s | |
| return CompressionMethod::Zlib; | ||
| if (*method_str == "brotli" || *method_str == "br") | ||
| return CompressionMethod::Brotli; | ||
| if (*method_str == "LZMA" || *method_str == "xz") | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. A small note...
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this the only place where LZMA should be removed? |
||
| return CompressionMethod::Xz; | ||
| if (hint.empty() || hint == "auto" || hint == "none") | ||
| return CompressionMethod::None; | ||
|
|
||
| throw Exception("Unknown compression method " + hint + ". Only 'auto', 'none', 'gzip', 'br' are supported as compression methods", | ||
| throw Exception( | ||
| "Unknown compression method " + hint + ". Only 'auto', 'none', 'gzip', 'br', 'xz' are supported as compression methods", | ||
| ErrorCodes::NOT_IMPLEMENTED); | ||
| } | ||
|
|
||
|
|
||
| std::unique_ptr<ReadBuffer> wrapReadBufferWithCompressionMethod( | ||
| std::unique_ptr<ReadBuffer> nested, | ||
| CompressionMethod method, | ||
| size_t buf_size, | ||
| char * existing_memory, | ||
| size_t alignment) | ||
| std::unique_ptr<ReadBuffer> nested, CompressionMethod method, size_t buf_size, char * existing_memory, size_t alignment) | ||
| { | ||
| if (method == CompressionMethod::Gzip || method == CompressionMethod::Zlib) | ||
| return std::make_unique<ZlibInflatingReadBuffer>(std::move(nested), method, buf_size, existing_memory, alignment); | ||
| #if USE_BROTLI | ||
| if (method == CompressionMethod::Brotli) | ||
| return std::make_unique<BrotliReadBuffer>(std::move(nested), buf_size, existing_memory, alignment); | ||
| #endif | ||
| if (method == CompressionMethod::Xz) | ||
| return std::make_unique<LZMAInflatingReadBuffer>(std::move(nested), buf_size, existing_memory, alignment); | ||
|
|
||
| if (method == CompressionMethod::None) | ||
| return nested; | ||
|
|
@@ -82,12 +90,7 @@ std::unique_ptr<ReadBuffer> wrapReadBufferWithCompressionMethod( | |
|
|
||
|
|
||
| std::unique_ptr<WriteBuffer> wrapWriteBufferWithCompressionMethod( | ||
| std::unique_ptr<WriteBuffer> nested, | ||
| CompressionMethod method, | ||
| int level, | ||
| size_t buf_size, | ||
| char * existing_memory, | ||
| size_t alignment) | ||
| std::unique_ptr<WriteBuffer> nested, CompressionMethod method, int level, size_t buf_size, char * existing_memory, size_t alignment) | ||
| { | ||
| if (method == DB::CompressionMethod::Gzip || method == CompressionMethod::Zlib) | ||
| return std::make_unique<ZlibDeflatingWriteBuffer>(std::move(nested), method, level, buf_size, existing_memory, alignment); | ||
|
|
@@ -96,6 +99,8 @@ std::unique_ptr<WriteBuffer> wrapWriteBufferWithCompressionMethod( | |
| if (method == DB::CompressionMethod::Brotli) | ||
| return std::make_unique<BrotliWriteBuffer>(std::move(nested), level, buf_size, existing_memory, alignment); | ||
| #endif | ||
| if (method == CompressionMethod::Xz) | ||
| return std::make_unique<LZMADeflatingWriteBuffer>(std::move(nested), level, buf_size, existing_memory, alignment); | ||
|
|
||
| if (method == CompressionMethod::None) | ||
| return nested; | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,125 @@ | ||
| #include <IO/LZMADeflatingWriteBuffer.h> | ||
|
|
||
|
|
||
| namespace DB | ||
| { | ||
| namespace ErrorCodes | ||
| { | ||
| extern const int LZMA_STREAM_ENCODER_FAILED; | ||
| } | ||
|
|
||
| LZMADeflatingWriteBuffer::LZMADeflatingWriteBuffer( | ||
| std::unique_ptr<WriteBuffer> out_, int compression_level, size_t buf_size, char * existing_memory, size_t alignment) | ||
| : BufferWithOwnMemory<WriteBuffer>(buf_size, existing_memory, alignment), out(std::move(out_)) | ||
| { | ||
|
|
||
| lstr = LZMA_STREAM_INIT; | ||
| lstr.allocator = nullptr; | ||
| lstr.next_in = nullptr; | ||
| lstr.avail_in = 0; | ||
| lstr.next_out = nullptr; | ||
| lstr.avail_out = 0; | ||
|
|
||
| // options for further compression | ||
| lzma_options_lzma opt_lzma2; | ||
| if (lzma_lzma_preset(&opt_lzma2, compression_level)) | ||
| throw Exception(ErrorCodes::LZMA_STREAM_ENCODER_FAILED, "lzma preset failed: lzma version: {}", LZMA_VERSION_STRING); | ||
|
|
||
|
|
||
| // LZMA_FILTER_X86 - | ||
| // LZMA2 - codec for *.xz files compression; LZMA is not suitable for this purpose | ||
| // VLI - variable length integer (in *.xz most integers encoded as VLI) | ||
| // LZMA_VLI_UNKNOWN (UINT64_MAX) - VLI value to denote that the value is unknown | ||
| lzma_filter filters[] = { | ||
| {.id = LZMA_FILTER_X86, .options = nullptr}, | ||
| {.id = LZMA_FILTER_LZMA2, .options = &opt_lzma2}, | ||
| {.id = LZMA_VLI_UNKNOWN, .options = nullptr}, | ||
| }; | ||
| lzma_ret ret = lzma_stream_encoder(&lstr, filters, LZMA_CHECK_CRC64); | ||
|
|
||
| if (ret != LZMA_OK) | ||
| throw Exception( | ||
| ErrorCodes::LZMA_STREAM_ENCODER_FAILED, | ||
| "lzma stream encoder init failed: error code: {} lzma version: {}", | ||
| ret, | ||
| LZMA_VERSION_STRING); | ||
| } | ||
|
|
||
| LZMADeflatingWriteBuffer::~LZMADeflatingWriteBuffer() | ||
| { | ||
| try | ||
| { | ||
| finish(); | ||
|
|
||
| lzma_end(&lstr); | ||
| } | ||
| catch (...) | ||
| { | ||
| tryLogCurrentException(__PRETTY_FUNCTION__); | ||
| } | ||
| } | ||
|
|
||
| void LZMADeflatingWriteBuffer::nextImpl() | ||
| { | ||
| if (!offset()) | ||
| return; | ||
|
|
||
| lstr.next_in = reinterpret_cast<unsigned char *>(working_buffer.begin()); | ||
| lstr.avail_in = offset(); | ||
|
|
||
| lzma_action action = LZMA_RUN; | ||
| do | ||
| { | ||
| out->nextIfAtEnd(); | ||
| lstr.next_out = reinterpret_cast<unsigned char *>(out->position()); | ||
| lstr.avail_out = out->buffer().end() - out->position(); | ||
|
|
||
| lzma_ret ret = lzma_code(&lstr, action); | ||
| out->position() = out->buffer().end() - lstr.avail_out; | ||
|
|
||
| if (ret == LZMA_STREAM_END) | ||
| return; | ||
|
|
||
| if (ret != LZMA_OK) | ||
| throw Exception( | ||
| ErrorCodes::LZMA_STREAM_ENCODER_FAILED, | ||
| "lzma stream encoding failed: error code: {}; lzma_version: {}", | ||
| ret, | ||
| LZMA_VERSION_STRING); | ||
|
|
||
| } while (lstr.avail_in > 0 || lstr.avail_out == 0); | ||
| } | ||
|
|
||
|
|
||
| void LZMADeflatingWriteBuffer::finish() | ||
| { | ||
| if (finished) | ||
| return; | ||
|
|
||
| next(); | ||
|
|
||
| do | ||
| { | ||
| out->nextIfAtEnd(); | ||
| lstr.next_out = reinterpret_cast<unsigned char *>(out->position()); | ||
| lstr.avail_out = out->buffer().end() - out->position(); | ||
|
|
||
| lzma_ret ret = lzma_code(&lstr, LZMA_FINISH); | ||
| out->position() = out->buffer().end() - lstr.avail_out; | ||
|
|
||
| if (ret == LZMA_STREAM_END) | ||
| { | ||
| finished = true; | ||
| return; | ||
| } | ||
|
|
||
| if (ret != LZMA_OK) | ||
| throw Exception( | ||
| ErrorCodes::LZMA_STREAM_ENCODER_FAILED, | ||
| "lzma stream encoding failed: error code: {}; lzma version: {}", | ||
| ret, | ||
| LZMA_VERSION_STRING); | ||
|
|
||
| } while (lstr.avail_out == 0); | ||
| } | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,32 @@ | ||
| #pragma once | ||
|
|
||
| #include <IO/BufferWithOwnMemory.h> | ||
| #include <IO/WriteBuffer.h> | ||
|
|
||
| #include <lzma.h> | ||
|
|
||
| namespace DB | ||
| { | ||
| /// Performs compression using lzma library and writes compressed data to out_ WriteBuffer. | ||
| class LZMADeflatingWriteBuffer : public BufferWithOwnMemory<WriteBuffer> | ||
| { | ||
| public: | ||
| LZMADeflatingWriteBuffer( | ||
| std::unique_ptr<WriteBuffer> out_, | ||
| int compression_level, | ||
| size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, | ||
| char * existing_memory = nullptr, | ||
| size_t alignment = 0); | ||
|
|
||
| void finish(); | ||
|
|
||
| ~LZMADeflatingWriteBuffer() override; | ||
|
|
||
| private: | ||
| void nextImpl() override; | ||
|
|
||
| std::unique_ptr<WriteBuffer> out; | ||
| lzma_stream lstr; | ||
| bool finished = false; | ||
| }; | ||
| } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Uses spaces for indentation, while all other lines use tabs (since
`git submodule add` uses them)