ClickHouse/src/Storages/MergeTree/MergeTreeReadTask.cpp at master · ClickHouse/ClickHouse

History

445 lines (372 loc) · 17.2 KB

Raw

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

286

287

288

289

290

291

292

293

294

295

296

297

298

299

300

301

302

303

304

305

306

307

308

309

310

311

312

313

314

315

316

317

318

319

320

321

322

323

324

325

326

327

328

329

330

331

332

333

334

335

336

337

338

339

340

341

342

343

344

345

346

347

348

349

350

351

352

353

354

355

356

357

358

359

360

361

362

363

364

365

366

367

368

369

370

371

372

373

374

375

376

377

378

379

380

381

382

383

384

385

386

387

388

389

390

391

392

393

394

395

396

397

398

399

400

401

402

403

404

405

406

407

408

409

410

411

412

413

414

415

416

417

418

419

420

421

422

423

424

425

426

427

428

429

430

431

432

433

434

435

436

437

438

439

440

441

442

443

444

445

#include <IO/Operators.h>

#include <Storages/MergeTree/IMergeTreeDataPart.h>

#include <Storages/MergeTree/LoadedMergeTreeDataPartInfoForReader.h>

#include <Storages/MergeTree/MergeTreeBlockReadUtils.h>

#include <Storages/MergeTree/MergeTreeIndexText.h>

#include <Storages/MergeTree/MergeTreeReadTask.h>

#include <Storages/MergeTree/MergeTreeReaderIndex.h>

#include <Storages/MergeTree/MergeTreeSelectProcessor.h>

#include <Storages/MergeTree/MergeTreeVirtualColumns.h>

#include <Storages/MergeTree/PatchParts/MergeTreePatchReader.h>

#include <Common/Exception.h>

#include <Common/ZooKeeper/ZooKeeperCommon.h>

#include <Processors/Transforms/LazyMaterializingTransform.h>

#include <Processors/QueryPlan/Optimizations/RuntimeDataflowStatistics.h>

namespace DB

{

/// Error codes referenced in this translation unit.
/// Only declared here (`extern`); the definition lives outside this file.
namespace ErrorCodes

{

extern const int LOGICAL_ERROR;

}

/// Renders the column sets of the task as text: one "STEP <i>" section per entry of
/// `pre_columns`, then a "MAIN" section for `columns`, then one "PATCH <i>" section
/// per entry of `patch_columns`.
String MergeTreeReadTaskColumns::dump() const
{
    WriteBufferFromOwnString out;

    size_t step_idx = 0;
    for (const auto & step_columns : pre_columns)
    {
        out << "STEP " << step_idx << ":\n" << step_columns.toString() << "\n";
        ++step_idx;
    }

    out << "MAIN:\n" << columns.toString() << "\n";

    size_t patch_idx = 0;
    for (const auto & patch : patch_columns)
    {
        out << "PATCH " << patch_idx << ":\n" << patch.toString() << "\n";
        ++patch_idx;
    }

    return out.str();
}

/// Collects the names of all columns from every `pre_columns` step (in step order)
/// followed by the names from `columns`. Names are appended as-is, without deduplication.
/// Columns from `patch_columns` are not included.
Names MergeTreeReadTaskColumns::getAllColumnNames() const
{
    Names all_names;

    const auto append_names = [&all_names](const auto & list)
    {
        for (const auto & name_and_type : list)
            all_names.push_back(name_and_type.name);
    };

    for (const auto & step_columns : pre_columns)
        append_names(step_columns);

    append_names(columns);

    return all_names;
}

/// Appends the columns of every `pre_columns` step (in step order) to the end of
/// `columns` by splicing, then drops all steps from `pre_columns`.
void MergeTreeReadTaskColumns::moveAllColumnsFromPrewhere()
{
    for (size_t step = 0; step < pre_columns.size(); ++step)
        columns.splice(columns.end(), std::move(pre_columns[step]));

    pre_columns.clear();
}

/// Forwards the new mark ranges to the main reader and then to each PREWHERE reader.
/// Other members of `Readers` are not touched by this function.
void MergeTreeReadTask::Readers::updateAllMarkRanges(const MarkRanges & ranges)
{
    main->updateAllMarkRanges(ranges);

    for (size_t i = 0; i < prewhere.size(); ++i)
        prewhere[i]->updateAllMarkRanges(ranges);
}

/// Creates a read task from the given part info, readers, mark ranges (for the part and
/// for its patch parts), block size parameters, size predictor and statistics updater.
/// Arguments taken by value are moved into the corresponding members.
///
/// When `updater_` is set, `dataflow_cache_update_cb` is installed; it forwards the columns
/// that were read, together with the part's column list and column sizes, to the updater.
/// Without an updater the callback is left untouched.
MergeTreeReadTask::MergeTreeReadTask(
    MergeTreeReadTaskInfoPtr info_,
    Readers readers_,
    MarkRanges mark_ranges_,
    std::vector<MarkRanges> patches_mark_ranges_,
    const BlockSizeParams & block_size_params_,
    MergeTreeBlockSizePredictorPtr size_predictor_,
    RuntimeDataflowStatisticsCacheUpdaterPtr updater_)
    : info(std::move(info_))
    , readers(std::move(readers_))
    , mark_ranges(std::move(mark_ranges_))
    , patches_mark_ranges(std::move(patches_mark_ranges_))
    , block_size_params(block_size_params_)
    , size_predictor(std::move(size_predictor_))
    , updater(std::move(updater_))
{
    if (updater)
    {
        /// The callback is stored in a member and only uses members (`updater`, `info`),
        /// so capture `this` explicitly rather than everything by reference: constructor
        /// parameters do not outlive the constructor and must never be reachable from it.
        /// NOTE: the callback refers to this object by address.
        dataflow_cache_update_cb
            = [this](const ColumnsWithTypeAndName & columns, size_t read_bytes, std::optional<bool> & should_continue_sampling) -> void
        {
            chassert(updater);

            const auto & part_columns = info->data_part->getColumns();
            auto column_sizes = info->data_part->getColumnSizes();

            updater->recordInputColumns(columns, part_columns, *column_sizes, read_bytes, should_continue_sampling);
        };
    }
}

/// Returns a pointer to the index read task if all columns in the read step belong to that index.
///
/// Returns nullptr when:
///  - there are no index read tasks;
///  - none of the columns of the step belongs to an index;
///  - the step mixes index columns with regular columns (see the comment below).
///
/// Throws LOGICAL_ERROR if the step contains columns of two different indexes.
static const IndexReadTask * getIndexReadTaskForReadStep(const IndexReadTasks & index_read_tasks, const NamesAndTypesList & columns_to_read)
{
    if (index_read_tasks.empty())
        return nullptr;

    /// Reverse mapping: column name -> name of the index that provides it.
    std::unordered_map<String, String> column_to_index;
    for (const auto & [index_name, index_task] : index_read_tasks)
    {
        for (const auto & column : index_task.columns)
            column_to_index[column.name] = index_name;
    }

    String index_for_step;
    bool has_non_index_columns = false;

    for (const auto & column : columns_to_read)
    {
        auto it = column_to_index.find(column.name);
        if (it == column_to_index.end())
        {
            has_non_index_columns = true;
        }
        else if (index_for_step.empty())
        {
            index_for_step = it->second;
        }
        else if (index_for_step != it->second)
        {
            throw Exception(ErrorCodes::LOGICAL_ERROR, "Found columns for multiple indexes ({} and {}) in one read step", index_for_step, it->second);
        }
    }

    /// Allow mixing index columns with regular columns when the regular columns are dependencies
    /// for evaluating default expressions of text index virtual columns (e.g., for partially materialized text indexes).
    /// In this case, don't return an index task - let the main reader handle all columns.
    /// The main reader will evaluate the default expression and fill the virtual column.
    if (!index_for_step.empty() && has_non_index_columns)
        return nullptr;

    return index_for_step.empty() ? nullptr : &index_read_tasks.at(index_for_step);
}

/// Builds the full set of readers for a task:
///  - the main reader for `task_columns.columns`;
///  - one reader per PREWHERE step: an index reader when the step is served by an index
///    read task, otherwise a regular part reader;
///  - one patch reader per patch part, reading `patches_ranges[i]`.
/// When vector search results are present in the read hints, the hints are passed to the
/// main reader and to every PREWHERE reader.
MergeTreeReadTask::Readers MergeTreeReadTask::createReaders(
    const MergeTreeReadTaskInfoPtr & read_info,
    const Extras & extras,
    const MarkRanges & ranges,
    const std::vector<MarkRanges> & patches_ranges)
{
    Readers result;

    /// Regular reader over the data part itself.
    /// The deserialization prefixes cache is passed only to non-PREWHERE readers.
    const auto make_part_reader = [&](const NamesAndTypesList & columns_to_read, bool is_prewhere)
    {
        auto part_info = std::make_shared<LoadedMergeTreeDataPartInfoForReader>(read_info->data_part, read_info->alter_conversions);

        return createMergeTreeReader(
            part_info,
            columns_to_read,
            extras.storage_snapshot,
            read_info->data_part->storage.getSettings(),
            ranges,
            read_info->const_virtual_fields,
            extras.uncompressed_cache,
            extras.mark_cache,
            is_prewhere ? nullptr : read_info->deserialization_prefixes_cache.get(),
            extras.reader_settings,
            extras.value_size_map,
            extras.profile_callback);
    };

    /// Reader over the patch part with the given index; never uses the prefixes cache.
    const auto make_patch_reader = [&](size_t part_idx)
    {
        return createMergeTreeReader(
            read_info->patch_parts[part_idx].part,
            read_info->task_columns.patch_columns[part_idx],
            extras.storage_snapshot,
            read_info->data_part->storage.getSettings(),
            patches_ranges[part_idx],
            read_info->const_virtual_fields,
            extras.uncompressed_cache,
            extras.mark_cache,
            /*deserialization_prefixes_cache=*/ nullptr,
            extras.reader_settings,
            extras.value_size_map,
            extras.profile_callback);
    };

    /// The main reader goes first: index readers below are created on top of it.
    result.main = make_part_reader(read_info->task_columns.columns, false);

    const bool has_vector_search_hints = read_info->read_hints.vector_search_results.has_value();
    if (has_vector_search_hints)
        result.main->data_part_info_for_read->setReadHints(read_info->read_hints, read_info->task_columns.columns);

    for (const auto & step_columns : read_info->task_columns.pre_columns)
    {
        const auto * index_task = getIndexReadTaskForReadStep(read_info->index_read_tasks, step_columns);

        if (index_task == nullptr)
        {
            result.prewhere.push_back(make_part_reader(step_columns, true));
        }
        else
        {
            /// Do not skip marks for queries with FINAL in the reader,
            /// because it may affect the result of the merging algorithm.
            const bool can_skip_marks = !index_task->is_final;

            result.prewhere.push_back(createMergeTreeReaderIndex(
                result.main.get(),
                index_task->index,
                step_columns,
                can_skip_marks));
        }

        if (has_vector_search_hints)
            result.prewhere.back()->data_part_info_for_read->setReadHints(read_info->read_hints, step_columns);
    }

    const size_t num_patches = read_info->patch_parts.size();
    for (size_t patch_idx = 0; patch_idx < num_patches; ++patch_idx)
    {
        result.patches.push_back(getPatchReader(
            read_info->patch_parts[patch_idx],
            make_patch_reader(patch_idx),
            extras.patch_join_cache));
    }

    return result;
}

/// Assembles the chain of range readers for a task, in this order:
///  1. the prepared index reader (only if `task_readers.prepared_index` is set);
///  2. one range reader per PREWHERE step, paired with the PREWHERE reader of the same position;
///  3. the main reader (only if it has columns to read).
/// Each range reader receives the sample block of the previous one (an empty block for the first).
///
/// Throws LOGICAL_ERROR if the number of PREWHERE steps differs from the number of PREWHERE readers.
MergeTreeReadersChain MergeTreeReadTask::createReadersChain(
    const Readers & task_readers,
    const PrewhereExprInfo & prewhere_actions,
    const ReadStepsPerformanceCounters & read_steps_performance_counters)
{
    if (prewhere_actions.steps.size() != task_readers.prewhere.size())
    {
        throw Exception(
            ErrorCodes::LOGICAL_ERROR,
            "PREWHERE steps count mismatch, actions: {}, readers: {}",
            prewhere_actions.steps.size(), task_readers.prewhere.size());
    }

    std::vector<MergeTreeRangeReader> range_readers;

    /// Upper bound on the chain length: an optional prepared index reader, one reader per
    /// PREWHERE step and an optional main reader. The step and reader counts are equal
    /// (checked above), so summing both of them would count the PREWHERE readers twice
    /// and still under-reserve when there are no PREWHERE steps.
    range_readers.reserve(task_readers.prewhere.size() + 2);

    /// Compute a combined flag: true only if ALL readers in the chain support incomplete granules.
    /// This ensures that the first reader in the chain (which decides batch boundaries) does not
    /// create mid-mark boundaries when a later reader cannot handle them.
    bool can_read_incomplete_granules = task_readers.main->canReadIncompleteGranules()
        && std::ranges::all_of(task_readers.prewhere, [](const auto & reader)
        {
            return reader->canReadIncompleteGranules();
        });

    if (task_readers.prepared_index)
    {
        range_readers.emplace_back(
            task_readers.prepared_index.get(),
            Block{},
            /*prewhere_info_=*/ nullptr,
            read_steps_performance_counters.getCounterForIndexStep(),
            /*main_reader_=*/ false,
            can_read_incomplete_granules);
    }

    /// Performance counters are indexed by step: PREWHERE steps first, then the main reader.
    size_t counter_idx = 0;
    for (size_t i = 0; i < prewhere_actions.steps.size(); ++i)
    {
        range_readers.emplace_back(
            task_readers.prewhere[i].get(),
            range_readers.empty() ? Block{} : range_readers.back().getSampleBlock(),
            prewhere_actions.steps[i].get(),
            read_steps_performance_counters.getCountersForStep(counter_idx++),
            /*main_reader_=*/ false,
            can_read_incomplete_granules);
    }

    if (!task_readers.main->getColumns().empty())
    {
        range_readers.emplace_back(
            task_readers.main.get(),
            range_readers.empty() ? Block{} : range_readers.back().getSampleBlock(),
            /*prewhere_info_=*/ nullptr,
            read_steps_performance_counters.getCountersForStep(counter_idx),
            /*main_reader_=*/ true,
            can_read_incomplete_granules);
    }

    return MergeTreeReadersChain{std::move(range_readers), task_readers.patches};
}

void MergeTreeReadTask::initializeReadersChain(

const PrewhereExprInfo & prewhere_actions,

MergeTreeIndexBuildContextPtr index_build_context,

LazyMaterializingRowsPtr lazy_materializing_rows,

const ReadStepsPerformanceCounters & read_steps_performance_counters)

{

if (readers_chain.isInitialized())

throw Exception(ErrorCodes::LOGICAL_ERROR, "Range readers chain is already initialized");

PrewhereExprInfo all_prewhere_actions;

if (index_build_context || lazy_materializing_rows)

initializeIndexReader(index_build_context, lazy_materializing_rows);

for (const auto & step : info->mutation_steps)

all_prewhere_actions.steps.push_back(step);

for (const auto & step : prewhere_actions.steps)

all_prewhere_actions.steps.push_back(step);

readers_chain = createReadersChain(readers, all_prewhere_actions, read_steps_performance_counters);

}

/// Creates `readers.prepared_index` when there is something for it to work with:
/// a prepared index read result obtained from `index_build_context` and/or the rows of
/// this part taken from `lazy_materializing_rows`. Both arguments may be null; if neither
/// yields anything, `readers.prepared_index` is left as is.
void MergeTreeReadTask::initializeIndexReader(const MergeTreeIndexBuildContextPtr & index_build_context, const LazyMaterializingRowsPtr & lazy_materializing_rows)
{
    /// Optionally initialize the index filter for the current read task. If the build context exists and contains
    /// relevant read ranges for the current part, retrieve or construct index filter for all involved skip indexes.
    /// This filter will later be used to filter granules during the first reading step.
    MergeTreeIndexReadResultPtr index_read_result;
    if (index_build_context)
        index_read_result = index_build_context->getPreparedIndexReadResult(*this);

    /// Rows of the current part, looked up by the part's index in the query.
    /// The pointer refers into `lazy_materializing_rows`, which is not owned by the created reader here.
    const PaddedPODArray<UInt64> * part_rows = nullptr;
    if (lazy_materializing_rows)
        part_rows = &lazy_materializing_rows->rows_in_parts[getInfo().part_index_in_query];

    if (index_read_result || lazy_materializing_rows)
    {
        readers.prepared_index = std::make_unique<MergeTreeReaderIndex>(readers.main.get(), std::move(index_read_result), part_rows);
    }
}

/// Estimates how many rows the next read should request.
///
/// Without a size predictor the answer is `max_block_size_rows`; in that case
/// `preferred_block_size_bytes` must not be set, otherwise LOGICAL_ERROR is thrown.
/// With a predictor, the estimate is derived from `preferred_block_size_bytes`, optionally
/// capped by `preferred_max_column_in_block_size_bytes`, and finally aligned using the
/// part's index granularity unless the current granule already has enough pending rows.
UInt64 MergeTreeReadTask::estimateNumRows() const
{
    if (!size_predictor)
    {
        if (block_size_params.preferred_block_size_bytes)
            throw Exception(ErrorCodes::LOGICAL_ERROR, "Size predictor is not set, it might lead to a performance degradation");

        return static_cast<size_t>(block_size_params.max_block_size_rows);
    }

    /// Number of rows that fit into preferred_block_size_bytes according to the predictor.
    const size_t predicted_rows = size_predictor->estimateNumRows(block_size_params.preferred_block_size_bytes);
    if (predicted_rows == 0)
        return predicted_rows;

    /// Do not go below the number of rows in the current granule.
    const auto rows_in_current_granule = readers_chain.numRowsInCurrentGranule();
    size_t rows_to_read = std::max(rows_in_current_granule, predicted_rows);

    if (block_size_params.preferred_max_column_in_block_size_bytes)
    {
        /// Limit derived from the largest column, using preferred_max_column_in_block_size_bytes.
        const auto rows_for_max_column
            = size_predictor->estimateNumRowsForMaxSizeColumn(block_size_params.preferred_max_column_in_block_size_bytes);

        const double filtration_ratio = std::max(block_size_params.min_filtration_ratio, 1.0 - size_predictor->filtered_rows_ratio);

        const auto rows_for_max_column_with_filtration
            = static_cast<size_t>(static_cast<double>(rows_for_max_column) / filtration_ratio);

        /// With this limit the number of rows to read may become smaller than the current granule.
        rows_to_read = std::min(rows_to_read, rows_for_max_column_with_filtration);
    }

    /// The current granule still has enough unread rows: no alignment is needed.
    const auto pending_rows_in_granule = readers_chain.numPendingRowsInCurrentGranule();
    if (pending_rows_in_granule >= rows_to_read)
        return rows_to_read;

    const auto & granularity = info->data_part->index_granularity;
    return granularity->countRowsForRows(readers_chain.currentMark(), rows_to_read, readers_chain.numReadRowsInCurrentGranule());
}

/// Reads the next portion of data through the readers chain.
///
/// The number of requested rows is the estimate from estimateNumRows(), limited by
/// `max_block_size_rows` and never less than 1. Returns the resulting block (empty if all
/// rows were filtered out) together with the mark ranges that were read, the number of
/// result rows and the numbers of rows and bytes actually read.
///
/// Throws LOGICAL_ERROR if a non-empty result has a different number of columns than the
/// sample block of the readers chain.
MergeTreeReadTask::BlockAndProgress MergeTreeReadTask::read()
{
    auto component_guard = Coordination::setCurrentComponent("MergeTreeReadTask::read");

    if (size_predictor)
        size_predictor->startBlock();

    UInt64 recommended_rows = estimateNumRows();
    UInt64 rows_to_read = std::max(static_cast<UInt64>(1), std::min(block_size_params.max_block_size_rows, recommended_rows));

    auto read_result = readers_chain.read(rows_to_read, mark_ranges, patches_mark_ranges, dataflow_cache_update_cb);

    /// All rows were filtered. Repeat.
    if (read_result.num_rows == 0)
        read_result.columns.clear();

    const auto & sample_block = readers_chain.getSampleBlock();
    if (read_result.num_rows != 0 && sample_block.columns() != read_result.columns.size())
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Inconsistent number of columns got from MergeTreeRangeReader. "
                        "Have {} in sample block and {} columns in list",
                        toString(sample_block.columns()), toString(read_result.columns.size()));

    /// TODO: check columns have the same types as in header.

    size_t num_read_rows = read_result.numReadRows();
    size_t num_read_bytes = read_result.numBytesRead();
    UInt64 num_filtered_rows = num_read_rows - read_result.num_rows;

    if (size_predictor)
    {
        size_predictor->updateFilteredRowsRation(num_read_rows, num_filtered_rows);

        if (!read_result.columns.empty())
            size_predictor->update(sample_block, read_result.columns, read_result.num_rows);
    }

    Block block;
    if (read_result.num_rows != 0)
    {
        for (auto & column : read_result.columns)
        {
            /// We may have columns that have other references, usually it is a constant column that has been created during analysis
            /// (that will not be const here anymore, i.e. after materialize()). The contract is - not to shrink if column is shared.
            /// But if some subcolumns are shared, we'll clone them via IColumn::mutate() and then safely shrink
            if (column->use_count() == 1)
            {
                auto mutable_column = IColumn::mutate(std::move(column));
                mutable_column->shrinkToFit();
                column = std::move(mutable_column);
            }
        }

        /// Build the block once, after all columns have been processed.
        block = sample_block.cloneWithColumns(read_result.columns);
    }

    BlockAndProgress res = {
        .block = std::move(block),
        .read_mark_ranges = read_result.read_mark_ranges,
        .row_count = read_result.num_rows,
        .num_read_rows = num_read_rows,
        .num_read_bytes = num_read_bytes };

    return res;
}

/// Appends the given mark ranges, in order, to the end of `prewhere_unmatched_marks`.
void MergeTreeReadTask::addPrewhereUnmatchedMarks(const MarkRanges & mark_ranges_)
{
    auto & unmatched = prewhere_unmatched_marks;
    unmatched.insert(unmatched.end(), mark_ranges_.begin(), mark_ranges_.end());
}

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

FilesExpand file tree

MergeTreeReadTask.cpp

Latest commit

History

MergeTreeReadTask.cpp

File metadata and controls