/* Reproduction script: INSERT ... SELECT between two Distributed tables with
   parallel_distributed_insert_select = 2. */

/* Remove tables left over from a previous run. */
DROP TABLE IF EXISTS src;
DROP TABLE IF EXISTS dst;
DROP TABLE IF EXISTS dst_local;

/* Local storage table; its column structure is taken from system.numbers. */
CREATE TABLE dst_local
    ENGINE = MergeTree
    ORDER BY tuple()
    AS system.numbers;

/* Distributed target that points at dst_local in the current database. */
CREATE TABLE dst
    ENGINE = Distributed('test_cluster_two_shards', currentDatabase(), dst_local, rand());

/* Distributed source that points at system.numbers_mt. */
CREATE TABLE src
    ENGINE = Distributed('test_cluster_two_shards', system, numbers_mt, rand());

SET parallel_distributed_insert_select = 2,
    prefer_localhost_replica = 1;

/* Reading from numbers_mt is only a trivial stand-in for a query that can run
   much faster when it is multithreaded. */
INSERT INTO dst
SELECT number
FROM src
WHERE not(sleepEachRow(0.001))
LIMIT 10000
SETTINGS
    max_block_size = 1,
    max_threads = 50;
-- Here the local subquery will be executed in ~10 seconds. But if you just run the select with the same settings, it will finish in ~0.4 seconds. Setting max_insert_threads does not help.