Skip to content

Commit d28dd49

Browse files
committed
lwt: Process lwt request on an owning shard
LWT is much more efficient if a request is processed on a shard that owns a token for the request. This is because otherwise the processing will bounce to an owning shard multiple times. The patch proposes a way to move the request to the correct shard before running lwt. It works by returning an error from the lwt code if the current shard is not the owning one; the error specifies the shard the request should be moved to. The error is processed by the transport code, which jumps to the correct shard and re-processes the incoming message there.
1 parent 2832f1d commit d28dd49

17 files changed

Lines changed: 281 additions & 60 deletions

auth/service.hh

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
#include <seastar/core/future.hh>
2929
#include <seastar/core/sstring.hh>
3030
#include <seastar/util/bool_class.hh>
31+
#include <seastar/core/sharded.hh>
3132

3233
#include "auth/authenticator.hh"
3334
#include "auth/authorizer.hh"
@@ -76,7 +77,9 @@ public:
7677
///
7778
/// All state associated with access-control is stored externally to any particular instance of this class.
7879
///
79-
class service final {
80+
/// peering_sharded_service inheritance is needed to be able to access shard local authentication service
81+
/// given an object from another shard. Used for bouncing lwt requests to correct shard.
82+
class service final : public seastar::peering_sharded_service<service> {
8083
permissions_cache_config _permissions_cache_config;
8184
std::unique_ptr<permissions_cache> _permissions_cache;
8285

cql3/statements/batch_statement.cc

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -377,6 +377,12 @@ future<shared_ptr<cql_transport::messages::result_message>> batch_statement::exe
377377
throw exceptions::invalid_request_exception(format("Unrestricted partition key in a conditional BATCH"));
378378
}
379379

380+
auto shard = service::storage_proxy::cas_shard(request->key()[0].start()->value().as_decorated_key().token());
381+
if (shard != engine().cpu_id()) {
382+
return make_ready_future<shared_ptr<cql_transport::messages::result_message>>(
383+
make_shared<cql_transport::messages::result_message::bounce_to_shard>(shard));
384+
}
385+
380386
return proxy.cas(schema, request, request->read_command(), request->key(),
381387
{read_timeout, qs.get_permit(), qs.get_client_state(), qs.get_trace_state()},
382388
cl_for_paxos, cl_for_commit, batch_timeout, cas_timeout).then([this, request] (bool is_applied) {

cql3/statements/modification_statement.cc

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -373,6 +373,12 @@ modification_statement::execute_with_condition(service::storage_proxy& proxy, se
373373
// modification in the list of CAS commands, since we're handling single-statement execution.
374374
request->add_row_update(*this, std::move(ranges), std::move(json_cache), options);
375375

376+
auto shard = service::storage_proxy::cas_shard(request->key()[0].start()->value().as_decorated_key().token());
377+
if (shard != engine().cpu_id()) {
378+
return make_ready_future<shared_ptr<cql_transport::messages::result_message>>(
379+
make_shared<cql_transport::messages::result_message::bounce_to_shard>(shard));
380+
}
381+
376382
return proxy.cas(s, request, request->read_command(), request->key(),
377383
{read_timeout, qs.get_permit(), qs.get_client_state(), qs.get_trace_state()},
378384
cl_for_paxos, cl_for_commit, statement_timeout, cas_timeout).then([this, request] (bool is_applied) {

cql3/statements/select_statement.cc

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -332,6 +332,14 @@ select_statement::do_execute(service::storage_proxy& proxy,
332332

333333
auto key_ranges = _restrictions->get_partition_key_ranges(options);
334334

335+
if (db::is_serial_consistency(options.get_consistency())) {
336+
unsigned shard = dht::shard_of(key_ranges[0].start()->value().as_decorated_key().token());
337+
if (engine().cpu_id() != shard) {
338+
return make_ready_future<shared_ptr<cql_transport::messages::result_message>>(
339+
make_shared<cql_transport::messages::result_message::bounce_to_shard>(shard));
340+
}
341+
}
342+
335343
if (!aggregate && !restrictions_need_filtering && (page_size <= 0
336344
|| !service::pager::query_pagers::may_need_paging(*_schema, page_size,
337345
*command, key_ranges))) {

service/client_state.hh

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,29 @@ public:
6868
UNINITIALIZED, AUTHENTICATION, READY
6969
};
7070

71+
// This class is used to move client_state between shards
72+
// It is created on a shard that owns client_state, then passed
73+
// to a target shard where client_state_for_another_shard::get()
74+
// can be called to obtain a shard local copy.
75+
class client_state_for_another_shard {
76+
private:
77+
const client_state* _cs;
78+
tracing::global_trace_state_ptr _trace_state;
79+
seastar::sharded<auth::service>* _auth_service;
80+
client_state_for_another_shard(const client_state* cs, tracing::global_trace_state_ptr gt,
81+
seastar::sharded<auth::service>* auth_service) : _cs(cs), _trace_state(gt), _auth_service(auth_service) {}
82+
friend client_state;
83+
public:
84+
client_state get() const {
85+
return client_state(_cs, _trace_state, _auth_service);
86+
}
87+
};
88+
private:
89+
client_state(const client_state* cs, tracing::global_trace_state_ptr gt, seastar::sharded<auth::service>* auth_service)
90+
: _keyspace(cs->_keyspace), _trace_state_ptr(gt), _user(cs->_user), _auth_state(cs->_auth_state),
91+
_is_internal(cs->_is_internal), _is_thrift(cs->_is_thrift), _remote_address(cs->_remote_address),
92+
_auth_service(auth_service ? &auth_service->local() : nullptr) {}
93+
friend client_state_for_another_shard;
7194
private:
7295
sstring _keyspace;
7396
tracing::trace_state_ptr _trace_state_ptr;
@@ -155,7 +178,8 @@ public:
155178
, _is_thrift(false)
156179
{}
157180

158-
client_state(client_state&) = delete;
181+
client_state(const client_state&) = delete;
182+
client_state(client_state&&) = default;
159183

160184
///
161185
/// `nullptr` for internal instances.
@@ -315,6 +339,10 @@ public:
315339
return _user;
316340
}
317341

342+
client_state_for_another_shard move_to_other_shard() {
343+
return client_state_for_another_shard(this, _trace_state_ptr, _auth_service ? &_auth_service->container() : nullptr);
344+
}
345+
318346
#if 0
319347
public static SemanticVersion[] getCQLSupportedVersion()
320348
{

service/query_state.hh

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,12 @@ public:
4242
, _permit(std::move(permit))
4343
{ }
4444

45+
query_state(client_state& client_state, tracing::trace_state_ptr trace_state_ptr, service_permit permit)
46+
: _client_state(client_state)
47+
, _trace_state_ptr(std::move(trace_state_ptr))
48+
, _permit(std::move(permit))
49+
{ }
50+
4551
const tracing::trace_state_ptr& get_trace_state() const {
4652
return _trace_state_ptr;
4753
}

service/storage_proxy.cc

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,10 @@ sstring get_local_dc() {
137137
return get_dc(local_addr);
138138
}
139139

140+
unsigned storage_proxy::cas_shard(dht::token token) {
141+
return dht::shard_of(token);
142+
}
143+
140144
class mutation_holder {
141145
protected:
142146
size_t _size = 0;
@@ -3969,6 +3973,7 @@ storage_proxy::do_query(schema_ptr s,
39693973
}
39703974
}
39713975

3976+
// WARNING: the function should be called on a shard that owns the key that is being read
39723977
future<storage_proxy::coordinator_query_result>
39733978
storage_proxy::do_query_with_paxos(schema_ptr s,
39743979
lw_shared_ptr<query::read_command> cmd,
@@ -3983,6 +3988,9 @@ storage_proxy::do_query_with_paxos(schema_ptr s,
39833988
auto cl_for_learn = cl == db::consistency_level::LOCAL_SERIAL ? db::consistency_level::LOCAL_QUORUM :
39843989
db::consistency_level::QUORUM;
39853990

3991+
if (cas_shard(partition_ranges[0].start()->value().as_decorated_key().token()) != engine().cpu_id()) {
3992+
throw std::logic_error("storage_proxy::do_query_with_paxos called on a wrong shard");
3993+
}
39863994
// All cas networking operations run with query provided timeout
39873995
db::timeout_clock::time_point timeout = query_options.timeout(*this);
39883996
// When to give up due to contention
@@ -4074,6 +4082,8 @@ storage_proxy::do_query_with_paxos(schema_ptr s,
40744082
* Note that since we are performing a CAS rather than a simple update, we perform a read (of committed
40754083
* values) between the prepare and accept phases. This gives us a slightly longer window for another
40764084
* coordinator to come along and trump our own promise with a newer one but is otherwise safe.
4085+
*
4086+
* WARNING: the function should be called on a shard that owns the key cas() operates on
40774087
*/
40784088
future<bool> storage_proxy::cas(schema_ptr schema, shared_ptr<cas_request> request, lw_shared_ptr<query::read_command> cmd,
40794089
dht::partition_range_vector&& partition_ranges, storage_proxy::coordinator_query_options query_options,
@@ -4086,6 +4096,10 @@ future<bool> storage_proxy::cas(schema_ptr schema, shared_ptr<cas_request> reque
40864096
db::validate_for_cas(cl_for_paxos);
40874097
db::validate_for_cas_commit(cl_for_commit, schema->ks_name());
40884098

4099+
if (cas_shard(partition_ranges[0].start()->value().as_decorated_key().token()) != engine().cpu_id()) {
4100+
throw std::logic_error("storage_proxy::cas called on a wrong shard");
4101+
}
4102+
40894103
shared_ptr<paxos_response_handler> handler;
40904104
try {
40914105
handler = seastar::make_shared<paxos_response_handler>(shared_from_this(),

service/storage_proxy.hh

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -532,6 +532,8 @@ public:
532532
return _stats;
533533
}
534534

535+
static unsigned cas_shard(dht::token token);
536+
535537
virtual void on_join_cluster(const gms::inet_address& endpoint) override;
536538
virtual void on_leave_cluster(const gms::inet_address& endpoint) override;
537539
virtual void on_up(const gms::inet_address& endpoint) override;

service/storage_service.cc

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2236,6 +2236,9 @@ future<> storage_service::start_native_transport() {
22362236
cql_server_config.max_request_size = ss._service_memory_total;
22372237
cql_server_config.get_service_memory_limiter_semaphore = [ss = std::ref(get_storage_service())] () -> semaphore& { return ss.get().local()._service_memory_limiter; };
22382238
cql_server_config.allow_shard_aware_drivers = cfg.enable_shard_aware_drivers();
2239+
smp_service_group_config cql_server_smp_service_group_config;
2240+
cql_server_smp_service_group_config.max_nonlocal_requests = 5000;
2241+
cql_server_config.bounce_request_smp_service_group = create_smp_service_group(cql_server_smp_service_group_config).get0();
22392242
seastar::net::inet_address ip = gms::inet_address::lookup(addr, family, preferred).get0();
22402243
cserver->start(std::ref(cql3::get_query_processor()), std::ref(ss._auth_service), std::ref(ss._cql_config), cql_server_config).get();
22412244
struct listen_cfg {

test/boost/cql_query_test.cc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1417,6 +1417,7 @@ SEASTAR_TEST_CASE(test_functions) {
14171417
res.push_back(rw[0]);
14181418
}
14191419
}
1420+
virtual void visit(const result_message::bounce_to_shard& rows) override { throw "bad"; }
14201421
};
14211422
validator v;
14221423
msg->accept(v);

0 commit comments

Comments
 (0)