
Commit f4b21d9

alternator, tablets: improve Alternator LSI tablets test
The test test_tablet_alternator_lsi_consistency, which checks that Alternator LSI allows strongly-consistent reads even with tablets, used a large cluster (10 nodes) to improve the chance of reaching an "unlucky" tablet placement - and even then it only failed in about half of the runs without the code fix.

In this patch we rewrite the test using a much more reliable approach: we start only two nodes, and force the base's tablet onto one node and the view's tablet onto the second node. This ensures with 100% certainty that the view update is remote, and the new test fails every single time before the code fix (I reverted the fix to verify) - and passes after it. The new test is not only more reliable, it's also significantly faster, because it doesn't need to start a 10-node cluster.

Signed-off-by: Nadav Har'El <[email protected]>
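(Not part of the patch - just background on the DynamoDB semantics the test relies on: a Local Secondary Index, unlike a Global Secondary Index, may be queried with ConsistentRead=True, so a query through the LSI must already see a preceding write to the base table. Below is a minimal plain-boto3 sketch of that guarantee; the endpoint URL, credentials, and table/index names are placeholders, and the test itself goes through its own get_alternator()/full_query() helpers rather than this code.)

import boto3

# Placeholders: point this at an Alternator (or DynamoDB) endpoint of your own.
dynamodb = boto3.resource('dynamodb', endpoint_url='http://127.0.0.1:8000',
                          region_name='us-east-1',
                          aws_access_key_id='alternator',
                          aws_secret_access_key='secret_pass')
table = dynamodb.Table('tbl')

item = {'p': 'dog', 'c': 'c0', 'd': 'd0'}
table.put_item(Item=item)
# An LSI supports ConsistentRead=True, so the just-written item must be
# returned. (A GSI would reject ConsistentRead=True.)
resp = table.query(IndexName='ind',
                   ConsistentRead=True,
                   KeyConditions={
                       'p': {'AttributeValueList': ['dog'], 'ComparisonOperator': 'EQ'},
                       'd': {'AttributeValueList': ['d0'], 'ComparisonOperator': 'EQ'}})
assert resp['Items'] == [item]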
1 parent 6baf178 commit f4b21d9

1 file changed

Lines changed: 60 additions & 42 deletions

File tree

test/topology_experimental_raft/test_mv_tablets.py

@@ -20,6 +20,15 @@
 
 logger = logging.getLogger(__name__)
 
+async def get_table_id(cql, keyspace_name, table_or_view_name):
+    # "table_or_view_name" may be either a table or a view, and those are
+    # listed in different system tables so we may need to search both:
+    matches = list(await cql.run_async(f"select id from system_schema.tables where keyspace_name = '{keyspace_name}' and table_name = '{table_or_view_name}'"))
+    if not matches:
+        matches = list(await cql.run_async(f"select id from system_schema.views where keyspace_name = '{keyspace_name}' and view_name = '{table_or_view_name}'"))
+    assert len(matches) == 1
+    return matches[0].id
+
 # This convenience function takes the name of a table or a view, and a token,
 # and returns the list of host_id,shard pairs holding tablets for this token
 # and view.
@@ -31,13 +40,7 @@ async def get_tablet_replicas(manager: ManagerClient, server: ServerInfo, keyspa
     host = (await wait_for_cql_and_get_hosts(manager.cql, [server], time.time() + 60))[0]
     await read_barrier(manager.cql, host)
 
-    # "table_or_view_name" may be either a table or a view, and those are
-    # listed in different system tables so we may need to search both:
-    matches = list(await manager.cql.run_async(f"select id from system_schema.tables where keyspace_name = '{keyspace_name}' and table_name = '{table_or_view_name}'"))
-    if not matches:
-        matches = list(await manager.cql.run_async(f"select id from system_schema.views where keyspace_name = '{keyspace_name}' and view_name = '{table_or_view_name}'"))
-    assert len(matches) == 1
-    table_id = matches[0].id
+    table_id = await get_table_id(manager.cql, keyspace_name, table_or_view_name)
 
     rows = await manager.cql.run_async(f"SELECT last_token, replicas FROM system.tablets where "
                                        f"keyspace_name = '{keyspace_name}' and "
@@ -75,6 +78,14 @@ async def pin_the_only_tablet(manager, keyspace_name, table_or_view_name, server
     # it will propagate it to all of them.
     await manager.api.move_tablet(server.ip_addr, keyspace_name, table_or_view_name, source_host_id, source_shard, target_host_id, target_shard, tablet_token)
 
+# Assert that the given table uses tablets, and has only one. It helps
+# verify that a test that attempted to enable tablets - and set up only
+# one tablet for the entire table - actually succeeded in doing that.
+async def assert_one_tablet(cql, keyspace_name, table_or_view_name):
+    table_id = await get_table_id(cql, keyspace_name, table_or_view_name)
+    rows = await cql.run_async(f"SELECT last_token, replicas FROM system.tablets where keyspace_name = '{keyspace_name}' and table_id = {table_id}")
+    assert len(rows) == 1
+
 
 @pytest.mark.asyncio
 async def test_tablet_mv_create(manager: ManagerClient):
@@ -180,24 +191,25 @@ async def test_tablet_alternator_lsi_consistency(manager: ManagerClient):
     """A reproducer for a bug where Alternator LSI was not using synchronous
     view updates when tablets are enabled, which could cause strongly-
     consistent read of the LSI to miss the data just written to the base.
-    We use a fairly large cluster to increase the chance that the tablet
-    randomization results in non-local pairing of base and view tablets.
-    We could make this test fail more reliably and only need 4 nodes if
-    we had an API to control the tablet placement.
-    We also use the "delay_before_remote_view_update" injection point
-    to add a delay to the view update - without it it's almost impossible
-    to reproduce this issue on a fast machine.
+
+    We use a cluster of just two nodes and RF=1, and control the tablets
+    so all base tablets will be in node 0 and all view tablets will be
+    in node 1, to ensure that the view update is remote and therefore
+    not synchronous by default. To make the test failure even more
+    likely on a fast machine, we use the "delay_before_remote_view_update"
+    injection point to add a delay to the view update more than usual.
     Reproduces #16313.
     """
-    servers = await manager.servers_add(10, config=alternator_config)
+    servers = await manager.servers_add(2, config=alternator_config)
     cql = manager.get_cql()
    alternator = get_alternator(servers[0].ip_addr)
-    # Currently, to create an Alternator table with tablets, a special
-    # tag must be given. See issue #16203. In the future this should be
-    # automatic - any Alternator table will use tablets.
-    tablets_tags = [{'Key': 'experimental:initial_tablets', 'Value': '100'}]
+    # Tell Alternator to create a table with just *one* tablet, via a
+    # special tag.
+    tablets_tags = [{'Key': 'experimental:initial_tablets', 'Value': '1'}]
     # Create a table with an LSI
-    table = alternator.create_table(TableName='alternator_table',
+    table_name = 'tbl'
+    index_name = 'ind'
+    table = alternator.create_table(TableName=table_name,
         BillingMode='PAY_PER_REQUEST',
         KeySchema=[
             {'AttributeName': 'p', 'KeyType': 'HASH' },
@@ -209,7 +221,7 @@ async def test_tablet_alternator_lsi_consistency(manager: ManagerClient):
             {'AttributeName': 'd', 'AttributeType': 'S' }
         ],
         LocalSecondaryIndexes=[
-            { 'IndexName': 'ind',
+            { 'IndexName': index_name,
               'KeySchema': [
                 { 'AttributeName': 'p', 'KeyType': 'HASH' },
                 { 'AttributeName': 'd', 'KeyType': 'RANGE' },
@@ -218,31 +230,37 @@ async def test_tablet_alternator_lsi_consistency(manager: ManagerClient):
             }
         ],
         Tags=tablets_tags)
-    # Above we connected to a one node to perform Alternator requests.
-    # Now we want to be able to connect to the above-created table through
-    # each one of the nodes. Let's define tables[i] as the same table accessed
-    # through node i (all of them refer to the same table):
-    alternators = [get_alternator(server.ip_addr) for server in servers]
-    tables = [alternator.Table(table.name) for alternator in alternators]
+
+    # This is how Alternator calls the CQL tables that back up the Alternator
+    # tables:
+    cql_keyspace_name = 'alternator_' + table_name
+    cql_table_name = table_name
+    cql_view_name = table_name + '!:' + index_name
+
+    # Verify that the above setup managed to correctly enable tablets, and
+    # ensure there is just one tablet for each table.
+    await assert_one_tablet(cql, cql_keyspace_name, cql_table_name)
+    await assert_one_tablet(cql, cql_keyspace_name, cql_view_name)
+
+    # Move the base tablet (there's just one) to node 0, and the view tablet
+    # to node 1. In particular, all view updates will then be remote: node 0
+    # will send view updates to node 1.
+    await pin_the_only_tablet(manager, cql_keyspace_name, cql_table_name, servers[0])
+    await pin_the_only_tablet(manager, cql_keyspace_name, cql_view_name, servers[1])
 
     await inject_error_on(manager, "delay_before_remote_view_update", servers);
 
-    # write to the table through one node (table1) and read from the view
-    # through another node (table2). In an LSI, it is allowed to use strong
+    # Write to the base table (which is on node 0) and read from the LSI
+    # (which is on node 1). In a DynamoDB LSI, it is allowed to use strong
     # consistency for the read, and it must return the just-written value.
-    # Try 10 times, in different combinations of nodes, to increase the
-    # chance of reproducing the bug.
-    for i in range(10):
-        table1 = tables[random.randrange(0, len(tables))]
-        table2 = tables[random.randrange(0, len(tables))]
-        item = {'p': 'dog', 'c': 'c'+str(i), 'd': 'd'+str(i)}
-        table1.put_item(Item=item)
-        assert [item] == full_query(table2, IndexName='ind',
-            KeyConditions={
-                'p': {'AttributeValueList': ['dog'], 'ComparisonOperator': 'EQ'},
-                'd': {'AttributeValueList': ['d'+str(i)], 'ComparisonOperator': 'EQ'}
-            }
-        )
+    item = {'p': 'dog', 'c': 'c0', 'd': 'd0'}
+    table.put_item(Item=item)
+    assert [item] == full_query(table, IndexName=index_name,
+        KeyConditions={
+            'p': {'AttributeValueList': ['dog'], 'ComparisonOperator': 'EQ'},
+            'd': {'AttributeValueList': ['d0'], 'ComparisonOperator': 'EQ'}
+        }
+    )
     table.delete()
 
 @pytest.mark.asyncio
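
(A usage note, not part of the patch: the new get_table_id()/assert_one_tablet() helpers can also be reused by other tests in this file to inspect tablet placement. A hypothetical sketch follows; 'ks' and 'tbl' are made-up names, and cql is assumed to be a connected session as in the tests above.)

# Hypothetical usage sketch: dump a table's tablet layout via system.tablets
# and assert it has exactly one tablet.
async def show_tablet_layout(cql, keyspace_name='ks', table_name='tbl'):
    table_id = await get_table_id(cql, keyspace_name, table_name)
    rows = await cql.run_async(
        f"SELECT last_token, replicas FROM system.tablets "
        f"where keyspace_name = '{keyspace_name}' and table_id = {table_id}")
    logger.info(f"{keyspace_name}.{table_name} ({table_id}) has {len(rows)} tablet(s): "
                f"{[r.replicas for r in rows]}")
    await assert_one_tablet(cql, keyspace_name, table_name)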
