Skip to content

Commit bcff15e

Browse files
authored
Beacon sync remove finalised hash early resolver (#3236)
* Fix potential race condition/crash info This might fix issue 3227. * Remove `accept()` handshake for early finalised hash resolving why The benefits of keeping that early resolution feature do not justify the annoyances/code-complexity increase. All that the early resolver did was fetching some finalised header early. The same header was fetched later again with the download of the chain of headers needed for syncing. It turns out that the early knowledge of a resolved finalised hash was never needed, nor used. Even the lazy resolution of the finalised hash (while collecting headers) seems to be seldom needed. On the other hand, there were hive test problems with the early resolver caused by the way the database delivery for the hive tests work.
1 parent 4a43f57 commit bcff15e

File tree

8 files changed

+78
-138
lines changed

8 files changed

+78
-138
lines changed

execution_chain/core/chain/header_chain_cache.nim

Lines changed: 47 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -103,17 +103,14 @@ type
103103
## +-------+
104104
##
105105
closed = 0 # no session
106-
notified # client app was offered a new session
107106
collecting # may append session headers
108107
ready # `ante` has a parent on the `FC` module
109108
orphan # no way to link into `FC` module
110109
locked # done with session headers, read-only state
111110

112-
HeaderChainNotifyCB* =
113-
proc(hdrNum: BlockNumber, finHash: Hash32) {.gcsafe, raises: [].}
114-
## Call back function telling a client app that a new chain was initialised.
115-
## The hash passed as argument is from the finaliser and must be resolved
116-
## as header argument in `startSession()`.
111+
HeaderChainNotifyCB* = proc() {.gcsafe, raises: [].}
112+
## Call back function telling a client app that a new session was
113+
## initialised and has started.
117114

118115
HeaderChainRef* = ref object
119116
## Module descriptor
@@ -269,6 +266,18 @@ proc tryFcParent(hc: HeaderChainRef; hdr: Header): HeaderChainMode =
269266
# The block number of `hdr` must not go below the `base`. It cannot be
270267
# handled even if it has a parent on the block chain (but not on the
271268
# `FC` module.)
269+
#
270+
# Rationale:
271+
# This situation might arise by means of `CL` requests via RPC updating
272+
# the `base` concurrently to a syncer client. This can only happen if the
273+
# base is near the canonical head. Which in turn means that the header
274+
# chain is short.
275+
#
276+
# Reversing the antecedent (above `base`) is avoided. This goes with the
277+
# argument that completely re-syncing a short header chain is worth it
278+
# compared to the additional administrative costs (for a syncer client of that
279+
# module) of handling backward moves of the antecedent.
280+
#
272281
if hdr.number <= baseNum:
273282
return orphan # beyond reach
274283

@@ -278,10 +287,10 @@ proc tryFcParent(hc: HeaderChainRef; hdr: Header): HeaderChainMode =
278287
# `hdr` could have a parent on the `FC` module which obviously failed
279288
# (i.e. `base` is not parent of `hdr`.)
280289
if hc.session.finHeader.number <= baseNum:
281-
# So `finalised` is ancestor of, or equal to `base`. In any case,
282-
# `hdr` has no parent on the canonical branch up to `base`. So it is
283-
# on another branch.
284-
return orphan # on the wrong branch
290+
# So, if resolved at all then `finalised` is ancestor of, or equal to
291+
# `base`. In any case, `hdr` has no parent on the canonical branch up
292+
# to `base`. So it is on another branch.
293+
return orphan # maybe on the wrong branch
285294

286295
# Now `base` and `finalised` (and all ancestors of either) are on the
287296
# canonical branch of the DAG generated by the `CL` logic.
@@ -312,14 +321,16 @@ proc headUpdateFromCL(hc: HeaderChainRef; h: Header; f: Hash32) =
312321
if hc.session.mode == closed:
313322
# Set new session environment
314323
hc.session = HccSession( # start new session
315-
mode: notified,
324+
mode: collecting,
316325
ante: h,
317326
head: h,
318327
headHash: h.computeBlockHash(),
319328
finHash: f)
320329

321-
# Inform client app about that a new session is available.
322-
hc.notify(h.number, f)
330+
hc.kvt.putHeader(h.number, encodePayload h)
331+
332+
# Inform client app that a new session has started.
333+
hc.notify()
323334

324335
# For logging and metrics
325336
hc.session.consHeadNum = h.number
@@ -334,40 +345,13 @@ func state*(hc: HeaderChainRef): HeaderChainMode =
334345
## Requested system state for running function:
335346
## ::
336347
## closed -- *internal*
337-
## notified -- accept()
338348
## collecting -- put()
339349
## ready -- complete()
340350
## orphan -- n/a
341351
## locked -- importBlock() from FC module
342352
##
343353
hc.session.mode
344354

345-
proc accept*(hc: HeaderChainRef; fin: Header): bool =
346-
## Accept and activate session.
347-
##
348-
## Required system state is to run this function is `notified`.
349-
##
350-
if hc.state == notified and
351-
hc.baseNum < fin.number and
352-
fin.number <= hc.session.head.number:
353-
354-
let finHash = fin.computeBlockHash()
355-
if hc.session.finHash != finHash:
356-
return false
357-
358-
hc.session.finHeader = fin
359-
hc.kvt.putHeader(hc.session.head.number, encodePayload hc.session.head)
360-
361-
# TODO: syncer and also ForkedCache should not start session
362-
# using finalizedHash, as evident from hive test
363-
# the FCU head can have block number bigger than finalized block.
364-
# Syncer and ForkedCache should only deal with FCU headHash.
365-
# TODO: move notifyBlockHashAndNumber call to fcPutHeader
366-
# where one of the headers should also the finalized header.
367-
hc.chain.notifyBlockHashAndNumber(finHash, fin.number)
368-
369-
hc.session.mode = collecting
370-
return true
371355

372356
proc clear*(hc: HeaderChainRef) =
373357
## Clear and flush current header chain cache session. This applies to an
@@ -493,6 +477,14 @@ proc put*(
493477
return err("Argument rev[] exceeds chain head " &
494478
hc.session.head.bnStr)
495479

480+
# Check whether the `FC` module has changed and the current antecedent
481+
# already is the end of it.
482+
block:
483+
let newMode = hc.tryFcParent(hc.session.ante)
484+
if newMode in {ready,orphan}:
485+
hc.session.mode = newMode
486+
return ok()
487+
496488
# Start at the entry that is parent to `ante` (if any)
497489
let offset = ((lastNumber + 1) - hc.session.ante.number).int
498490
if offset < rev.len:
@@ -524,12 +516,18 @@ proc put*(
524516
return err("Parent hash mismatch for rev[" & $n & "].number=" &
525517
bn.bnStr)
526518

527-
# Check whether the current header matches the `finalised` one. If so,
528-
# headers/hashes must match.
529-
if hc.session.finHeader.number == bn:
530-
if hc.session.finHash != hash:
531-
raiseAssert RaisePfx & "finaliser not on header chain: " & bn.bnStr &
532-
" hash=" & hc.session.finHash.short & " expected=" & hash.short
519+
# Resolve finaliser if possible.
520+
if hc.session.finHash == hash:
521+
hc.session.finHeader = hdr
522+
523+
# Ackn: moved here from `accept()`
524+
# TODO: syncer and also ForkedCache should not start session
525+
# using finalizedHash, as evident from hive test
526+
# the FCU head can have block number bigger than finalized block.
527+
# Syncer and ForkedCache should only deal with FCU headHash.
528+
# TODO: move notifyBlockHashAndNumber call to fcPutHeader
529+
# where one of the headers should also be the finalized header.
530+
hc.chain.notifyBlockHashAndNumber(hash, bn)
533531

534532
# Check whether `hdr` has a parent on the `FC` module.
535533
let newMode = hc.tryFcParent(hdr)
@@ -570,6 +568,9 @@ proc commit*(hc: HeaderChainRef): Result[void,string] =
570568

571569
let baseNum = hc.baseNum
572570
if baseNum < hc.session.finHeader.number:
571+
#
572+
# So the `finalised` hash was resolved (otherwise it would have a zero
573+
# block number.)
573574
#
574575
# There are two segments of the canonical chain, `base..finalised` and
575576
# `ante..finalised` (the latter possibly degraded) which both share at

execution_chain/sync/beacon/worker.nim

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -187,10 +187,6 @@ proc runPeer*(
187187
buddy.only.multiRunIdle = Moment.now() - buddy.only.stoppedMultiRun
188188
buddy.only.nMultiLoop.inc # statistics/debugging
189189

190-
# Resolve consensus header target when needed. It comes with a finalised
191-
# header hash where we need to complete the block number.
192-
await buddy.headerStagedResolveFinalizer info
193-
194190
if not await buddy.napUnlessSomethingToFetch():
195191

196192
# Download and process headers and blocks

execution_chain/sync/beacon/worker/headers_staged.nim

Lines changed: 1 addition & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -11,50 +11,17 @@
1111
{.push raises:[].}
1212

1313
import
14-
std/sets,
1514
pkg/[chronicles, chronos],
1615
pkg/eth/common,
1716
pkg/stew/[interval_set, sorted_set],
1817
../worker_desc,
1918
./headers_staged/[headers, staged_collect],
20-
./[headers_unproc, update]
19+
./headers_unproc
2120

2221
# ------------------------------------------------------------------------------
2322
# Public functions
2423
# ------------------------------------------------------------------------------
2524

26-
proc headerStagedResolveFinalizer*(
27-
buddy: BeaconBuddyRef;
28-
info: static[string];
29-
) {.async: (raises: []).} =
30-
## Fetch finalised beacon header if there is an update available
31-
let ctx = buddy.ctx
32-
if ctx.pool.lastState == idleSyncState and
33-
ctx.pool.finRequest != zeroHash32:
34-
35-
# Grab `finRequest`. So no other peer will interfere
36-
let finHash = ctx.pool.finRequest
37-
ctx.pool.finRequest = zeroHash32
38-
39-
# Fetch header
40-
const iv = BnRange.new(0,0) # dummy interval of length 1
41-
let fin = (await buddy.headersFetchReversed(iv, finHash, info)).valueOr:
42-
# Postponed, try later
43-
ctx.pool.finRequest = finHash
44-
ctx.pool.failedPeers.incl buddy.peerID
45-
46-
debug info & ": finalised hash not yet resolved", peer=buddy.peer,
47-
finHash=finHash.short, hdrErrors=buddy.hdrErrors,
48-
failedPeers=ctx.pool.failedPeers.len
49-
50-
# Check failed peers whether there was a limit reached and a reset is
51-
# necessary.
52-
ctx.updateFailedPeersFromResolvingFinalizer info
53-
return
54-
55-
ctx.updateFromHibernateSetTarget(fin[0], info)
56-
57-
5825
func headersStagedFetchOk*(buddy: BeaconBuddyRef): bool =
5926
# Helper for `worker.nim`, etc.
6027
0 < buddy.ctx.headersUnprocAvail() and

execution_chain/sync/beacon/worker/headers_staged/staged_collect.nim

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -132,16 +132,27 @@ proc collectAndStashOnDiskCache*(
132132
ctrl=buddy.ctrl.state, hdrErrors=buddy.hdrErrors, `error`=error
133133
break fetchHeadersBody # error => exit block
134134

135-
# Antecedent `dangling` of the header cache might not be at `rev[^1]`.
136-
let revLen = rev[0].number - ctx.dangling.number + 1
135+
# Note that `put()` might not have used all of the `rev[]` items for
136+
# updating the antecedent (aka `ctx.dangling`.) So `rev[^1]` might be
137+
# an ancestor of the antecedent.
138+
#
139+
# By design, the unused items from `rev[]` list may savely be considered
140+
# as consumed so that the next cycle can continue. In practice, if there
141+
# are used `rev[]` items then the `state` will have changed which is
142+
# handled in the `while` loop header clause.
143+
#
144+
# Other possibilities would imply that `put()` was called from several
145+
# instances with overlapping `rev[]` argument lists which is precluded
146+
# by the administration of the `iv` argument for this function
147+
# `collectAndStashOnDiskCache()`.
137148

138149
# Update remaining range to fetch and check for end-of-loop condition
139-
let newTopBefore = ivTop - revLen
150+
let newTopBefore = ivTop - BlockNumber(rev.len)
140151
if newTopBefore < iv.minPt:
141152
break # exit while() loop
142153

143154
ivTop = newTopBefore # mostly results in `ivReq.minPt-1`
144-
parent = rev[revLen-1].parentHash # parent hash for next fetch request
155+
parent = rev[^1].parentHash # parent hash for next fetch request
145156
# End loop
146157

147158
trace info & ": fetched and stored headers", peer, iv,

execution_chain/sync/beacon/worker/start_stop.nim

Lines changed: 5 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ import
1818
../worker_desc,
1919
./blocks_staged/staged_queue,
2020
./headers_staged/staged_queue,
21-
./[blocks_unproc, headers_unproc]
21+
./[blocks_unproc, headers_unproc, update]
2222

2323
type
2424
SyncStateData = tuple
@@ -67,15 +67,10 @@ proc setupServices*(ctx: BeaconCtxRef; info: static[string]) =
6767
# into `ForkedChainRef` (i.e. `ctx.pool.chain`.)
6868
ctx.pool.hdrCache = HeaderChainRef.init(ctx.pool.chain)
6969

70-
# Set up new notifier informing when the new head is available from the
71-
# `CL`. If there is a new session available, the notifier is called with
72-
# the hash of a finalised header that needs to be resolved. This hash is
73-
# stored on the `ctx.pool` descriptor to be picked up by the next available
74-
# sync peer.
75-
ctx.hdrCache.start proc(hdr: BlockNumber, fin: Hash32) =
76-
ctx.pool.finRequest = fin
77-
info "Finalised hash registered", target=hdr.bnStr,
78-
finHash=fin.short, nSyncPeers=ctx.pool.nBuddies
70+
# Set up the notifier informing when a new syncer session has started.
71+
ctx.hdrCache.start proc() =
72+
# Activates the syncer. Work will be picked up by peers when available.
73+
ctx.updateFromHibernateSetTarget info
7974

8075
# Manual first run?
8176
if 0 < ctx.clReq.consHead.number:

execution_chain/sync/beacon/worker/update.nim

Lines changed: 10 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -215,30 +215,24 @@ proc updateSyncState*(ctx: BeaconCtxRef; info: static[string]) =
215215

216216
proc updateFromHibernateSetTarget*(
217217
ctx: BeaconCtxRef;
218-
fin: Header;
219218
info: static[string];
220219
) =
221220
## If in hibernate mode, accept a cache session and activate syncer
222221
##
223222
if ctx.hibernate:
224-
# Clear cul-de-sac prevention registry. This applies to both, successful
225-
# access handshake or error.
226-
ctx.pool.failedPeers.clear()
223+
let (b, t) = (ctx.chain.baseNumber, ctx.hdrCache.head.number)
227224

228-
if ctx.hdrCache.accept(fin):
229-
let (b, t) = (ctx.chain.baseNumber, ctx.head.number)
225+
# Exclude the case of a single header chain which would be `T` only
226+
if b+1 < t:
227+
ctx.pool.lastState = collectingHeaders # state transition
228+
ctx.hibernate = false # wake up
230229

231-
# Exclude the case of a single header chain which would be `T` only
232-
if b+1 < t:
233-
ctx.pool.lastState = collectingHeaders # state transition
234-
ctx.hibernate = false # wake up
230+
# Update range
231+
ctx.headersUnprocSet(b+1, t-1)
235232

236-
# Update range
237-
ctx.headersUnprocSet(b+1, t-1)
238-
239-
info "Activating syncer", base=b.bnStr,
240-
head=ctx.chain.latestNumber.bnStr, fin=fin.bnStr, target=t.bnStr
241-
return
233+
info "Activating syncer", base=b.bnStr,
234+
head=ctx.chain.latestNumber.bnStr, target=t.bnStr
235+
return
242236

243237
# Failed somewhere on the way
244238
ctx.hdrCache.clear()
@@ -247,23 +241,6 @@ proc updateFromHibernateSetTarget*(
247241
head=ctx.chain.latestNumber.bnStr, state=ctx.hdrCache.state
248242

249243

250-
proc updateFailedPeersFromResolvingFinalizer*(
251-
ctx: BeaconCtxRef;
252-
info: static[string];
253-
) =
254-
## This function resets the finalised hash resolver if there are too many
255-
## sync peer download errors. This prevents from a potential deadlock if
256-
## the syncer is notified with bogus `(finaliser,sync-head)` data.
257-
##
258-
if ctx.hibernate:
259-
if fetchFinalisedHashResolverFailedPeersHwm < ctx.pool.failedPeers.len:
260-
info "Abandoned resolving hash (try next)",
261-
finHash=ctx.pool.finRequest.short, failedPeers=ctx.pool.failedPeers.len
262-
ctx.hdrCache.clear()
263-
ctx.pool.failedPeers.clear()
264-
ctx.pool.finRequest = zeroHash32
265-
266-
267244
proc updateAsyncTasks*(
268245
ctx: BeaconCtxRef;
269246
): Future[Opt[void]] {.async: (raises: []).} =

execution_chain/sync/beacon/worker_config.nim

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -53,12 +53,6 @@ const
5353

5454
# ----------------------
5555

56-
fetchFinalisedHashResolverFailedPeersHwm* = 10
57-
## If there are more failing peers than this `hwm` when initially resolving
58-
## the finalised hash then the whole procedure gets cancelled and reset.
59-
## The system will proceed waiting for a new request to resolve a finalised
60-
## hash.
61-
6256
fetchHeadersFailedInitialFailPeersHwm* = 30
6357
## If there are more failing peers than this `hwm` right at the begining
6458
## of a header chain download scrum (before any data received), then this

execution_chain/sync/beacon/worker_desc.nim

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,6 @@ type
9797
nBuddies*: int ## Number of active workers
9898
clReq*: SyncClMesg ## Manual first target set up
9999
lastState*: SyncLayoutState ## Last known layout state
100-
finRequest*: Hash32 ## To be resolved before session
101100
hdrSync*: HeaderFetchSync ## Syncing by linked header chains
102101
blkSync*: BlocksFetchSync ## For importing/executing blocks
103102
nextMetricsUpdate*: Moment ## For updating metrics

0 commit comments

Comments
 (0)