Skip to content

Commit 29b5874

Browse files
committed
go/oasis-node/cmd/storage: Add command for offline pruning (POC)
When aggressive pruning is enabled, the node may fall behind. To prevent this, we should offer validators a maintenance command that should be run before starting the node whenever pruning is enabled later on.
1 parent 9a12d7f commit 29b5874

File tree

2 files changed

+177
-7
lines changed

2 files changed

+177
-7
lines changed

go/consensus/cometbft/abci/prune.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -195,6 +195,8 @@ PruneLoop:
195195
return nil
196196
}
197197

198+
// Warning: When registering a new handler, DO NOT forget to update the logic for
199+
// the "oasis-node storage prune-experimental" command as well.
198200
func (p *genericPruner) RegisterHandler(handler consensus.StatePruneHandler) {
199201
p.Lock()
200202
defer p.Unlock()

go/oasis-node/cmd/storage/storage.go

Lines changed: 175 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,13 @@ import (
66
"errors"
77
"fmt"
88
"io/fs"
9+
"math"
910
"os"
1011
"path/filepath"
1112
"strings"
1213
"time"
1314

15+
cmtBlockstore "github.com/cometbft/cometbft/store"
1416
badgerDB "github.com/dgraph-io/badger/v4"
1517
"github.com/spf13/cobra"
1618

@@ -20,6 +22,7 @@ import (
2022
"github.com/oasisprotocol/oasis-core/go/config"
2123
"github.com/oasisprotocol/oasis-core/go/consensus/cometbft/abci"
2224
cmtCommon "github.com/oasisprotocol/oasis-core/go/consensus/cometbft/common"
25+
cmtConfig "github.com/oasisprotocol/oasis-core/go/consensus/cometbft/config"
2326
cometbftBadger "github.com/oasisprotocol/oasis-core/go/consensus/cometbft/db/badger"
2427
cmdCommon "github.com/oasisprotocol/oasis-core/go/oasis-node/cmd/common"
2528
roothash "github.com/oasisprotocol/oasis-core/go/roothash/api"
@@ -70,6 +73,13 @@ WARNING: Ensure you have at least as much of a free disk as your largest databas
7073
RunE: doDBCompactions,
7174
}
7275

76+
pruneCmd = &cobra.Command{
77+
Use: "prune-experimental",
78+
Args: cobra.NoArgs,
79+
Short: "EXPERIMENTAL: trigger pruning for all consensus databases",
80+
RunE: doPrune,
81+
}
82+
7383
logger = logging.GetLogger("cmd/storage")
7484

7585
pretty = cmdCommon.Isatty(1)
@@ -395,7 +405,17 @@ func flattenBadgerDB(db *badgerDB.DB, logger *logging.Logger, path string) error
395405
}
396406

397407
func compactConsensusNodeDB(dataDir string) error {
398-
ldb, ndb, _, err := abci.InitStateStorage(
408+
ndb, err := openConsensusStateNodeDB(dataDir)
409+
if err != nil {
410+
return fmt.Errorf("failed to initialize ABCI storage backend: %w", err)
411+
}
412+
defer ndb.Close()
413+
414+
return ndb.Compact()
415+
}
416+
417+
func openConsensusStateNodeDB(dataDir string) (db.NodeDB, error) {
418+
_, ndb, _, err := abci.InitStateStorage(
399419
&abci.ApplicationConfig{
400420
DataDir: filepath.Join(dataDir, cmtCommon.StateDir),
401421
StorageBackend: config.GlobalConfig.Storage.Backend,
@@ -405,16 +425,163 @@ func compactConsensusNodeDB(dataDir string) error {
405425
// ChainContext: doc.ChainContext(), TODO: Should we read this from the doc?
406426
},
407427
)
408-
if err != nil {
409-
return fmt.Errorf("failed to initialize ABCI storage backend: %w", err)
428+
429+
return ndb, err
430+
}
431+
432+
func doPrune(_ *cobra.Command, args []string) error {
433+
if err := cmdCommon.Init(); err != nil {
434+
cmdCommon.EarlyLogAndExit(err)
435+
}
436+
437+
// TODO consider validating correct mode?
438+
439+
dataDir := cmdCommon.DataDir()
440+
if err := pruneConsensusDBs(dataDir); err != nil {
441+
return fmt.Errorf("failed to prune consensus databases: %w", err)
410442
}
411443

412-
// Close the resources. Both Close and Cleanup only close NodeDB.
413-
// Closing both here, to prevent resource leaks in things change in the future.
444+
return nil
445+
}
446+
447+
func pruneConsensusDBs(dataDir string) error {
448+
if config.GlobalConfig.Consensus.Prune.Strategy == cmtConfig.PruneStrategyNone {
449+
logger.Info("skipping consensus pruning: (strategy=%s)", cmtConfig.PruneStrategyNone)
450+
return nil
451+
}
452+
453+
ndb, err := openConsensusStateNodeDB(dataDir)
454+
if err != nil {
455+
return fmt.Errorf("failed to open NodeDB: %w", err)
456+
}
414457
defer ndb.Close()
415-
defer ldb.Cleanup()
416458

417-
return ndb.Compact()
459+
latest, ok := ndb.GetLatestVersion()
460+
if !ok {
461+
logger.Info("skipping consensus pruning as state db is empty")
462+
return nil
463+
}
464+
465+
earliest, err := pruneConsensusState(dataDir, ndb, latest)
466+
if err != nil {
467+
return fmt.Errorf("failed to prune application state: %w", err)
468+
}
469+
470+
if err := pruneCometDBs(dataDir, int64(earliest)); err != nil {
471+
return fmt.Errorf("failed to prune CometBFT managed databases: %w", err)
472+
}
473+
474+
return nil
475+
}
476+
477+
func pruneConsensusState(dataDir string, ndb db.NodeDB, latest uint64) (uint64, error) {
478+
if latest < config.GlobalConfig.Consensus.Prune.NumKept {
479+
logger.Info("consensus state pruning skipped: latest version is smaller than the number of versions to keep")
480+
return latest, nil
481+
}
482+
483+
// In case of configured runtimes, we should not prune past the latest reindexed
484+
// consensus height, so that light history can be populated correctly.
485+
minReindexed, err := minReindexedHeight(dataDir)
486+
if err != nil {
487+
return 0, fmt.Errorf("failed to fetch minimum reindexed consensus height: %w", err)
488+
}
489+
490+
start := ndb.GetEarliestVersion()
491+
end := min(
492+
latest-config.GlobalConfig.Consensus.Prune.NumKept, // does not underflow due to if at the top.
493+
uint64(minReindexed),
494+
)
495+
496+
if end <= start {
497+
logger.Info("consensus state already pruned")
498+
return end, nil
499+
}
500+
501+
logger.Info("pruning consensus state", "start", start, "end", end)
502+
for i := start; i < end; i++ {
503+
if err := ndb.Prune(i); err != nil {
504+
return 0, fmt.Errorf("failed to prune version %d: %w", i, err)
505+
}
506+
507+
if i%10_000 == 0 { // TODO not sure this is even needed.
508+
if err := ndb.Sync(); err != nil {
509+
return 0, fmt.Errorf("failed to sync NodeDB: %w", err)
510+
}
511+
logger.Debug("forcing NodeDB disk sync during pruning", "version", i)
512+
}
513+
}
514+
515+
if err := ndb.Sync(); err != nil {
516+
return 0, fmt.Errorf("failed to sync NodeDB: %w", err)
517+
}
518+
519+
return end, nil
520+
}
521+
522+
// minReindexedHeight returns the smallest consensus height reindexed by any
523+
// of the configured runtimes.
524+
//
525+
// In case of no configured runtimes it returns max int64.
526+
func minReindexedHeight(dataDir string) (int64, error) {
527+
fetchLastReindexedHeight := func(runtimeID common.Namespace) (int64, error) {
528+
rtDir := runtimeConfig.GetRuntimeStateDir(dataDir, runtimeID)
529+
mode := config.GlobalConfig.Mode
530+
hasLocalStorage := mode.HasLocalStorage() && !mode.IsArchive()
531+
532+
// TODO ideally we would not start whole light history with all background workers, but this would
533+
// require as to refactor existing code...
534+
history, err := history.New(runtimeID, rtDir, history.NewNonePrunerFactory(), hasLocalStorage)
535+
if err != nil {
536+
return 0, fmt.Errorf("failed to open new light history: %w", err)
537+
}
538+
defer history.Close()
539+
540+
h, err := history.LastConsensusHeight()
541+
if err != nil {
542+
return 0, fmt.Errorf("failed to get last consensus height: %w", err)
543+
}
544+
545+
return h, nil
546+
}
547+
548+
var minH int64 = math.MaxInt64
549+
for _, rt := range config.GlobalConfig.Runtime.Runtimes {
550+
h, err := fetchLastReindexedHeight(rt.ID)
551+
if err != nil {
552+
return 0, fmt.Errorf("failed to fetch last reindexed height for %s: %w", rt.ID, err)
553+
}
554+
555+
if h < minH {
556+
minH = h
557+
}
558+
}
559+
560+
return minH, nil
561+
}
562+
563+
func pruneCometDBs(dataDir string, height int64) error {
564+
// TODO: This is a hack. In fact even if we manage to get this right via
565+
// BadgerDBProvider and somehow pass correct config via context, this will
566+
// still not be intended way to use it. I believe this hack is worth it, but
567+
// we should definitely release this command as experimental first.
568+
blockstorePath := fmt.Sprintf("%s/consensus/data/blockstore.badger.db", dataDir)
569+
blockDB, err := cometbftBadger.New(blockstorePath, false)
570+
if err != nil {
571+
return fmt.Errorf("failed to open blockstore: %w", err)
572+
}
573+
blockstore := cmtBlockstore.NewBlockStore(blockDB)
574+
575+
logger.Info("pruning consensus blockstore", "target_height", height)
576+
n, err := blockstore.PruneBlocks(height)
577+
if err != nil {
578+
return fmt.Errorf("failed to prune blocks: %w", err)
579+
}
580+
logger.Info("consensus blockstore finished", "pruned", n)
581+
582+
// TODO add pruning of state.badger.db
583+
584+
return nil
418585
}
419586

420587
// Register registers the client sub-command and all of its children.
@@ -425,5 +592,6 @@ func Register(parentCmd *cobra.Command) {
425592
storageCmd.AddCommand(storageCheckCmd)
426593
storageCmd.AddCommand(storageRenameNsCmd)
427594
storageCmd.AddCommand(storageCompactCmd)
595+
storageCmd.AddCommand(pruneCmd)
428596
parentCmd.AddCommand(storageCmd)
429597
}

0 commit comments

Comments
 (0)