
Commit d3bdc74

op-supervisor: resetTracker fixes (#16028)

* op-supervisor: resetTracker draft changes
* op-supervisor: syncnode reset tracker fixes
* op-acceptance-tests: unlock sequencing window expiry test
* op-supervisor,op-node,tests: fix sequencing window expiry test
* op-supervisor,tests: fix review / ci

Co-authored-by: Sebastian Stammler <[email protected]>
1 parent 8200c07 commit d3bdc74

11 files changed (+151, −74 lines)

op-acceptance-tests/tests/interop/seqwindow/expiry_test.go

Lines changed: 65 additions & 3 deletions
@@ -1,6 +1,7 @@
 package seqwindow
 
 import (
+    "strings"
     "testing"
     "time"
 
@@ -9,14 +10,14 @@ import (
     "github.com/ethereum-optimism/optimism/op-devstack/devtest"
     "github.com/ethereum-optimism/optimism/op-devstack/presets"
     "github.com/ethereum-optimism/optimism/op-service/eth"
+    "github.com/ethereum-optimism/optimism/op-service/retry"
 )
 
 // TestSequencingWindowExpiry tests that the sequencing window may expire,
 // the chain reorgs because of it, and that the chain then recovers.
+// This test can take 3 minutes to run.
 func TestSequencingWindowExpiry(gt *testing.T) {
     t := devtest.SerialT(gt)
-    // TODO(#15769): fix sequencing-window expiry interop functionality
-    t.Skip("Interop sequencing-window expiry interaction is known to not fully work yet.")
 
     sys := presets.NewSimpleInterop(t)
     require := t.Require()
@@ -39,7 +40,14 @@ func TestSequencingWindowExpiry(gt *testing.T) {
     t.Logger().Info("Tx 1 is safe now")
 
     // Stop the batcher of chain A, so the L2 unsafe blocks will not get submitted.
-    require.NoError(sys.L2BatcherA.ActivityAPI().StopBatcher(t.Ctx()))
+    // This may take a while, since the batcher is still actively submitting new data.
+    require.NoError(retry.Do0(t.Ctx(), 10, retry.Exponential(), func() error {
+        err := sys.L2BatcherA.ActivityAPI().StopBatcher(t.Ctx())
+        if err != nil && strings.Contains(err.Error(), "batcher is not running") {
+            return nil
+        }
+        return err
+    }))
     stoppedAt := sys.L1Network.WaitForBlock() // wait for new block, in case there is any batch left
     // Make sure the supervisor has synced enough of the L1, for the local-safe query to work.
     sys.Supervisor.AwaitMinL1(stoppedAt.Number)
@@ -75,13 +83,34 @@ func TestSequencingWindowExpiry(gt *testing.T) {
         "tx", receipt2.TxHash, "l2Block", old)
     // The logs will show a "Chain reorg detected" from op-geth.
 
+    // Once this happens we don't want to keep trying to include the tx in a block again,
+    // since that will then be reorged out again.
+    // We need to enter recovery-mode so we don't keep building an incompatible chain that will not get submitted.
+    // It may reorg once more, but then stays compatible.
+    // For a while we'll have to build blocks that are not going to be reorged out due to subtle L1 origin divergence.
+    t.Logger().Info("Turning on recovery-mode")
+    t.Require().NoError(sys.L2CLA.Escape().RollupAPI().SetRecoverMode(t.Ctx(), true))
+
     t.Logger().Info("Waiting for sequencing window expiry induced reorg now")
     // Monitor that the old unsafe chain is reorged out as expected
     require.Eventually(func() bool {
         latest := sys.L2ELA.BlockRefByNumber(old.Number)
         return latest.Hash != old.Hash
     }, windowDuration+time.Second*20, time.Second, "expecting old block to be reorged out")
 
+    // Wait for the tx to no longer be included.
+    // The tx-indexer may still return the old block or be stale.
+    // So instead, look up the tx nonce.
+    require.Eventually(func() bool {
+        latestNonce, err := sys.L2ELA.Escape().EthClient().NonceAt(t.Ctx(), alice.Address(), nil)
+        if err != nil {
+            t.Logger().Error("Failed to look up pending nonce")
+            return false
+        }
+        t.Logger().Info("Checking tx 2 nonce", "latest", latestNonce, "tx2", tx2.Nonce.Value())
+        return latestNonce <= tx2.Nonce.Value()
+    }, windowDuration+time.Second*20, time.Second, "tx should be reorged out and not come back")
+
     t.Logger().Info("Waiting for supervisor to surpass pre-reorg chain now")
     // Monitor that the supervisor can continue to sync.
     // A lot more blocks will expire first; the local-safe chain will be entirely force-derived blocks.
@@ -102,6 +131,39 @@ func TestSequencingWindowExpiry(gt *testing.T) {
     other := sys.L2ELA.BlockRefByNumber(safe.Derived.Number)
     require.Equal(safe.Derived.Hash, other.Hash, "supervisor must match chain with EL")
 
+    t.Logger().Info("Re-enabling batch-submitter")
     // re-enable the batcher now that we are done with the test.
     t.Require().NoError(sys.L2BatcherA.ActivityAPI().StartBatcher(t.Ctx()))
+    // TODO(#16036): batcher submits future span batch, misses an L2 block.
+    // For now it uses singular batches as a work-around.
+
+    // Build the missing blocks, catch up on the local-safe chain.
+    t.Require().Eventually(func() bool {
+        status := sys.L2CLA.SyncStatus()
+        return status.LocalSafeL2.Number+10 > status.UnsafeL2.Number
+    }, windowDuration, time.Second*2, "wait for local-safe to recover near unsafe head")
+
+    // Once we have enough margin to not get reorged again before the batch-submitter acts,
+    // exit recovery mode, so we can include txs again.
+    t.Logger().Info("Exiting recovery mode")
+    t.Require().NoError(sys.L2CLA.Escape().RollupAPI().SetRecoverMode(t.Ctx(), false))
+
+    // Now confirm a tx; the chain should be healthy again.
+    tx3 := alice.Transfer(common.HexToAddress("0x7777"), eth.GWei(100))
+    receipt3, err := tx3.Included.Eval(t.Ctx())
+    require.NoError(err)
+    t.Logger().Info("Confirmed tx 3", "tx", receipt3.TxHash, "block", receipt3.BlockHash)
+
+    // Wait for tx 3 to become cross-safe.
+    // We are not interested in the sequencing window expiring and reverting all the way back to 0.
+    require.Eventually(func() bool {
+        status := sys.L2CLA.SyncStatus()
+        t.Logger().Info("Awaiting tx safety",
+            "local-unsafe", status.UnsafeL2, "local-safe", status.LocalSafeL2)
+        return status.SafeL2.Number > receipt3.BlockNumber.Uint64()
+    }, time.Second*60, time.Second, "wait for tx 3 to be safe")
+    t.Logger().Info("Tx 3 is safe now")
+    // Sanity-check that the block the tx was included in is really still canonical.
+    got := sys.L2ELA.BlockRefByNumber(receipt3.BlockNumber.Uint64())
+    t.Require().Equal(receipt3.BlockHash, got.Hash, "tx 3 was included in canonical block")
 }
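The retry wrapper introduced above makes stopping the batcher idempotent: a "batcher is not running" error simply means the stop already took effect. A minimal standalone sketch of the same idiom, using the op-service retry package exactly as the test does (the stopBatcher helper and BatcherAdmin interface are hypothetical names, not part of the diff):

package example

import (
    "context"
    "strings"

    "github.com/ethereum-optimism/optimism/op-service/retry"
)

// BatcherAdmin is a hypothetical stand-in for the batcher's activity/admin API.
type BatcherAdmin interface {
    StopBatcher(ctx context.Context) error
}

// stopBatcher retries the stop call with exponential backoff. The batcher may
// be busy submitting data when asked to stop, so transient errors are retried;
// "batcher is not running" is treated as success, since it means an earlier
// attempt (or a concurrent stop) already landed.
func stopBatcher(ctx context.Context, api BatcherAdmin) error {
    return retry.Do0(ctx, 10, retry.Exponential(), func() error {
        err := api.StopBatcher(ctx)
        if err != nil && strings.Contains(err.Error(), "batcher is not running") {
            return nil // already stopped; treat as success
        }
        return err
    })
}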

op-acceptance-tests/tests/interop/seqwindow/init_test.go

Lines changed: 12 additions & 1 deletion
@@ -3,13 +3,24 @@ package seqwindow
 import (
     "testing"
 
+    bss "github.com/ethereum-optimism/optimism/op-batcher/batcher"
     "github.com/ethereum-optimism/optimism/op-devstack/presets"
+    "github.com/ethereum-optimism/optimism/op-devstack/stack"
+    "github.com/ethereum-optimism/optimism/op-devstack/sysgo"
+    "github.com/ethereum-optimism/optimism/op-node/rollup/derive"
 )
 
 func TestMain(m *testing.M) {
     presets.DoMain(m,
         presets.WithSimpleInterop(),
         // Short enough that we can run the test,
         // long enough that the batcher can still submit something before we make things expire.
-        presets.WithSequencingWindow(10, 30))
+        presets.WithSequencingWindow(10, 30),
+        stack.MakeCommon(sysgo.WithBatcherOption(func(id stack.L2BatcherID, cfg *bss.CLIConfig) {
+            // Span-batches during recovery don't appear to align well with the starting-point.
+            // It can be off by ~6 L2 blocks, possibly due to off-by-one
+            // in L1 block sync considerations in batcher stop or start.
+            // So we end up having to encode block by block, so the full batch data does not get dropped.
+            cfg.BatchType = derive.SingularBatchType
+        })))
 }

op-devstack/sysgo/l2_batcher.go

Lines changed: 12 additions & 0 deletions
@@ -44,6 +44,14 @@ func (b *L2Batcher) hydrate(system stack.ExtensibleSystem) {
     l2Net.(stack.ExtensibleL2Network).AddL2Batcher(bFrontend)
 }
 
+type BatcherOption func(id stack.L2BatcherID, cfg *bss.CLIConfig)
+
+func WithBatcherOption(opt BatcherOption) stack.Option[*Orchestrator] {
+    return stack.Deploy[*Orchestrator](func(orch *Orchestrator) {
+        orch.batcherOptions = append(orch.batcherOptions, opt)
+    })
+}
+
 func WithBatcher(batcherID stack.L2BatcherID, l1ELID stack.L1ELNodeID, l2CLID stack.L2CLNodeID, l2ELID stack.L2ELNodeID) stack.Option[*Orchestrator] {
     return stack.AfterDeploy(func(orch *Orchestrator) {
         require := orch.P().Require()
@@ -94,12 +102,16 @@ func WithBatcher(batcherID stack.L2BatcherID, l1ELID stack.L1ELNodeID, l2CLID st
         Stopped:               false,
         BatchType:             derive.SpanBatchType,
         MaxBlocksPerSpanBatch: 10,
+        PreferLocalSafeL2:     l2CL.cfg.Rollup.InteropTime != nil,
         DataAvailabilityType:  batcherFlags.CalldataType,
         CompressionAlgo:       derive.Brotli,
         RPC: oprpc.CLIConfig{
             EnableAdmin: true,
         },
     }
+    for _, opt := range orch.batcherOptions {
+        opt(batcherID, batcherCLIConfig)
+    }
 
     batcher, err := bss.BatcherServiceFromCLIConfig(
         orch.P().Ctx(), "0.0.1", batcherCLIConfig,
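The option hook runs after WithBatcher assembles the default CLIConfig, so a test can override any field per batcher ID. A minimal sketch of wiring it into a devstack TestMain, mirroring the init_test.go change above (the MaxBlocksPerSpanBatch override is purely illustrative):

package example

import (
    "testing"

    bss "github.com/ethereum-optimism/optimism/op-batcher/batcher"
    "github.com/ethereum-optimism/optimism/op-devstack/presets"
    "github.com/ethereum-optimism/optimism/op-devstack/stack"
    "github.com/ethereum-optimism/optimism/op-devstack/sysgo"
)

func TestMain(m *testing.M) {
    presets.DoMain(m,
        presets.WithSimpleInterop(),
        // Options are applied in order, after the defaults, for every batcher ID.
        stack.MakeCommon(sysgo.WithBatcherOption(func(id stack.L2BatcherID, cfg *bss.CLIConfig) {
            cfg.MaxBlocksPerSpanBatch = 5 // illustrative override of a default
        })))
}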

op-devstack/sysgo/orchestrator.go

Lines changed: 3 additions & 0 deletions
@@ -26,6 +26,9 @@ type Orchestrator struct {
     // nil if no time travel is supported
     timeTravelClock *clock.AdvancingClock
 
+    // options
+    batcherOptions []BatcherOption
+
     superchains locks.RWMap[stack.SuperchainID, *Superchain]
     clusters    locks.RWMap[stack.ClusterID, *Cluster]
     l1Nets      locks.RWMap[eth.ChainID, *L1Network]

op-node/rollup/derive/base_batch_stage.go

Lines changed: 1 addition & 1 deletion
@@ -188,7 +188,7 @@ func (bs *baseBatchStage) deriveNextEmptyBatch(ctx context.Context, outOfData bo
     // to preserve that L2 time >= L1 time. If this is the first block of the epoch, always generate a
     // batch to ensure that we at least have one batch per epoch.
     if nextTimestamp < nextEpoch.Time || firstOfEpoch {
-        bs.log.Info("Generating next batch", "epoch", epoch, "timestamp", nextTimestamp)
+        bs.log.Info("Generating next batch", "epoch", epoch, "timestamp", nextTimestamp, "parent", parent)
         return &SingularBatch{
             ParentHash: parent.Hash,
             EpochNum:   rollup.Epoch(epoch.Number),

op-service/apis/eth.go

Lines changed: 3 additions & 0 deletions
@@ -109,6 +109,9 @@ type TransactionSender interface {
 type EthNonce interface {
     // PendingNonceAt returns the account nonce of the given account in the pending state.
     PendingNonceAt(ctx context.Context, account common.Address) (uint64, error)
+    // NonceAt returns the account nonce of the given account in the state at the given block number.
+    // A nil block number may be used to get the latest state.
+    NonceAt(ctx context.Context, account common.Address, blockNumber *big.Int) (uint64, error)
 }
 
 type EthBalance interface {

op-service/sources/eth_client.go

Lines changed: 8 additions & 0 deletions
@@ -548,6 +548,14 @@ func (s *EthClient) PendingNonceAt(ctx context.Context, account common.Address)
     return uint64(result), err
 }
 
+// NonceAt returns the account nonce of the given account in the state at the given block number.
+// A nil block number may be used to get the latest state.
+func (s *EthClient) NonceAt(ctx context.Context, account common.Address, blockNumber *big.Int) (uint64, error) {
+    var result hexutil.Uint64
+    err := s.client.CallContext(ctx, &result, "eth_getTransactionCount", account, toBlockNumArg(blockNumber))
+    return uint64(result), err
+}
+
 func toBlockNumArg(number *big.Int) string {
     if number == nil {
         return "latest"

op-supervisor/supervisor/backend/rewinder/rewinder.go

Lines changed: 1 addition & 0 deletions
@@ -135,6 +135,7 @@ func (r *Rewinder) handleLocalDerivedEvent(ev superevents.LocalSafeUpdateEvent)
         break
     }
 
+    r.log.Warn("Rewinding logs DB", "chain", ev.ChainID, "target", target)
     // Try to rewind and stop if it succeeds
     err = r.db.RewindLogs(ev.ChainID, target)
     if err != nil {

op-supervisor/supervisor/backend/syncnode/node.go

Lines changed: 19 additions & 52 deletions
@@ -449,63 +449,30 @@ func (m *ManagedNode) Close() error {
 func (m *ManagedNode) resetIfInconsistent() {
     ctx, cancel := context.WithTimeout(m.ctx, internalTimeout)
     defer cancel()
-
-    // Handle the given seen block from the node if it is an inconsistency.
-    handle := func(name string, seenFromNode eth.BlockID) (inconsistent bool) {
-        if seenFromNode == (eth.BlockID{}) {
-            return false // if we haven't seen anything, then don't reset to it
-        }
-        // We know the last block from the op-node.
-        // Ae may have references that:
-        // - local-unsafe matches, and local-safe matches -> no inconsistency caused by latest signaled local-unsafe!
-        // - local-unsafe matches, but local-safe differs -> e.g. if local-safe is behind and does not know about this block
-        // - local-unsafe differs, but local-safe matches -> we should stick to the local-safe chain, not rewind further, but may need to rewind some
-        // - local-unsafe differs, and local-safe differs -> we should use the local-safe chain as truth
-        //
-        // And never should we reset if the node is just ahead of us; resetIfAhead handles that.
-
-        m.log.Debug("Checking last seen block from node for consistency", "safety", name, "block", seenFromNode)
-        localUnsafeMatchErr := m.backend.IsLocalUnsafe(ctx, m.chainID, seenFromNode)
-        if errors.Is(localUnsafeMatchErr, types.ErrFuture) {
-            localUnsafeMatchErr = nil // do not count it when the node is ahead of us
-        }
-        if localUnsafeMatchErr != nil {
-            m.log.Warn("Last seen block from node is inconsistent with supervisor local-unsafe blocks",
-                "safety", name, "err", localUnsafeMatchErr)
-        }
-        localSafeMatchErr := m.backend.IsLocalSafe(ctx, m.chainID, seenFromNode)
-        if errors.Is(localSafeMatchErr, types.ErrFuture) {
-            localSafeMatchErr = nil // do not count it when the node is ahead of us
-        }
-        if localSafeMatchErr != nil {
-            m.log.Warn("Last seen block from node is inconsistent with supervisor local-safe blocks",
-                "safety", name, "err", localSafeMatchErr)
-        }
-        if localUnsafeMatchErr != nil || localSafeMatchErr != nil {
-            // If either mismatches, we want to reset back no further than latest local-safe
-            localSafe, err := m.backend.LocalSafe(ctx, m.chainID)
-            if err != nil {
-                m.log.Debug("Cannot determine how to handle inconsistency, no local-safe data available",
-                    "localUnsafeMatchErr", localUnsafeMatchErr,
-                    "localSafeMatchErr", localSafeMatchErr, "err", err)
-                return false
-            }
-            if m.lastNodeLocalUnsafe.Number > localSafe.Derived.Number {
-                m.resetTracker.beginBisectionReset(m.lastNodeLocalUnsafe)
-            } else {
-                m.resetTracker.beginBisectionReset(localSafe.Derived)
-            }
-            return true
-        }
-        return false
+    seenFromNode := m.lastNodeLocalSafe
+    name := "local-safe"
+    if seenFromNode == (eth.BlockID{}) {
+        return // if we haven't seen anything, then don't reset to it
     }
-    if handle("local-unsafe", m.lastNodeLocalUnsafe) {
+    m.log.Debug("Checking last seen block from node for consistency", "safety", name, "block", seenFromNode)
+    localSafeMatchErr := m.backend.IsLocalSafe(ctx, m.chainID, seenFromNode)
+    if localSafeMatchErr == nil {
         return
     }
-    if handle("local-safe", m.lastNodeLocalSafe) {
+    if errors.Is(localSafeMatchErr, types.ErrFuture) {
+        m.log.Debug("node is ahead of op-supervisor local-safe data")
+        return // resetIfAhead will handle this
+    }
+    m.log.Warn("Last seen block from node is inconsistent with supervisor local-safe blocks",
+        "safety", name, "err", localSafeMatchErr)
+    // If there is a mismatch, we want to reset back no further than latest local-safe
+    localSafe, err := m.backend.LocalSafe(ctx, m.chainID)
+    if err != nil {
+        m.log.Debug("Cannot determine how to handle inconsistency, no local-safe data available",
+            "localSafeMatchErr", localSafeMatchErr, "err", err)
         return
     }
-    m.log.Debug("no inconsistency found")
+    m.resetTracker.beginBisectionReset(localSafe.Derived)
 }
 
 // resetIfAhead checks if the node is ahead of the local-safe db
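The rewrite above collapses the old four-case local-unsafe/local-safe matrix into a single comparison against the node's last local-safe block. A compact sketch of the resulting decision flow (a condensed restatement; the decideReset function and its string results are hypothetical, and the import paths are assumed to match the repo):

package example

import (
    "errors"

    "github.com/ethereum-optimism/optimism/op-service/eth"
    "github.com/ethereum-optimism/optimism/op-supervisor/supervisor/types"
)

// decideReset condenses the new resetIfInconsistent flow: the supervisor now
// only checks the node's last local-safe block against its local-safe DB.
func decideReset(seen eth.BlockID, isLocalSafe func(eth.BlockID) error) string {
    if seen == (eth.BlockID{}) {
        return "skip" // nothing seen from the node yet, nothing to reset to
    }
    err := isLocalSafe(seen)
    switch {
    case err == nil:
        return "consistent" // block matches; no reset needed
    case errors.Is(err, types.ErrFuture):
        return "defer" // node is ahead of the supervisor; resetIfAhead handles it
    default:
        return "bisection-reset" // mismatch: rewind no further than latest local-safe
    }
}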

op-supervisor/supervisor/backend/syncnode/node_test.go

Lines changed: 1 addition & 1 deletion
@@ -139,7 +139,7 @@ func TestPrepareReset(t *testing.T) {
         pivot = i
         node.resetTracker.bisectToTarget()
         require.Equal(t, i, unsafe.Number)
-        require.Equal(t, uint64(0), safe.Number)
+        require.Equal(t, i, safe.Number)
         require.Equal(t, uint64(0), finalized.Number)
     }
 