Skip to content

Commit 76c7bae

Browse files
committed
handle the lost quorum recovering from special config changing during zombie cleanup
1 parent 447b0d2 commit 76c7bae

File tree

4 files changed

+27
-4
lines changed

4 files changed

+27
-4
lines changed

base-kv/base-kv-store-server/src/main/java/com/baidu/bifromq/basekv/store/option/KVRangeOptions.java

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,11 +46,12 @@ public class KVRangeOptions {
4646
@Builder.Default
4747
private int statsCollectIntervalSec = 5;
4848
@Builder.Default
49-
private int zombieTimeoutSec = 300; // 5min
49+
private int zombieTimeoutSec = 60; // 1min
5050
@Builder.Default
5151
private RaftConfig walRaftConfig = new RaftConfig()
5252
.setPreVote(true)
53+
.setHeartbeatTimeoutTick(5) // 500ms
5354
.setInstallSnapshotTimeoutTick(6000) // 10min
54-
.setElectionTimeoutTick(30) // 3 sec
55+
.setElectionTimeoutTick(30) // 3s
5556
.setMaxSizePerAppend(100 * 1024 * 1024); // 100MB;
5657
}

base-kv/base-kv-store-server/src/main/java/com/baidu/bifromq/basekv/store/range/KVRangeFSM.java

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@
4040
import static com.google.common.collect.Sets.union;
4141
import static java.util.Collections.emptySet;
4242
import static java.util.Collections.singleton;
43+
import static java.util.Collections.singletonList;
4344

4445
import com.baidu.bifromq.base.util.AsyncRunner;
4546
import com.baidu.bifromq.baseenv.EnvProvider;
@@ -162,6 +163,7 @@ public class KVRangeFSM implements IKVRangeFSM {
162163
private final Subject<List<SplitHint>> splitHintsSubject = BehaviorSubject.<List<SplitHint>>create().toSerialized();
163164
private final Subject<Any> factSubject = BehaviorSubject.create();
164165
private final KVRangeOptions opts;
166+
private final AtomicBoolean recovering = new AtomicBoolean();
165167
private final AtomicReference<Lifecycle> lifecycle = new AtomicReference<>(Lifecycle.Init);
166168
private final CompositeDisposable disposables = new CompositeDisposable();
167169
private final CompletableFuture<Void> closeSignal = new CompletableFuture<>();
@@ -1587,8 +1589,19 @@ private void detectZombieState(KVRangeDescriptor descriptor) {
15871589
private void checkZombieState() {
15881590
if (zombieAt > 0
15891591
&& Duration.ofMillis(HLC.INST.getPhysical() - zombieAt).toSeconds() > opts.getZombieTimeoutSec()) {
1590-
log.info("Zombie state detected, send quit signal.");
1591-
quitSignal.complete(null);
1592+
ClusterConfig clusterConfig = wal.latestClusterConfig();
1593+
if (clusterConfig.getNextVotersCount() > 0
1594+
&& clusterConfig.getVotersList().equals(singletonList(hostStoreId))) {
1595+
// recover from single voter change lost quorum
1596+
if (recovering.compareAndSet(false, true)) {
1597+
log.info("Recovering from lost quorum during changing config from single voter: \n{}",
1598+
clusterConfig);
1599+
wal.recover().whenComplete((v, e) -> recovering.set(false));
1600+
}
1601+
} else {
1602+
log.info("Zombie state detected, send quit signal.");
1603+
quitSignal.complete(null);
1604+
}
15921605
}
15931606
}
15941607

base-kv/base-kv-store-server/src/main/java/com/baidu/bifromq/basekv/store/wal/IKVRangeWAL.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
import com.baidu.bifromq.basekv.proto.KVRangeSnapshot;
1919
import com.baidu.bifromq.basekv.raft.IRaftNode;
2020
import com.baidu.bifromq.basekv.raft.event.ElectionEvent;
21+
import com.baidu.bifromq.basekv.raft.proto.ClusterConfig;
2122
import com.baidu.bifromq.basekv.raft.proto.LogEntry;
2223
import com.baidu.bifromq.basekv.raft.proto.RaftMessage;
2324
import com.baidu.bifromq.basekv.raft.proto.RaftNodeStatus;
@@ -51,6 +52,8 @@ public interface IKVRangeWAL {
5152

5253
KVRangeSnapshot latestSnapshot();
5354

55+
ClusterConfig latestClusterConfig();
56+
5457
CompletableFuture<Long> propose(KVRangeCommand command);
5558

5659
Observable<Map<String, RaftNodeSyncState>> replicationStatus();

base-kv/base-kv-store-server/src/main/java/com/baidu/bifromq/basekv/store/wal/KVRangeWAL.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
import com.baidu.bifromq.basekv.raft.event.SnapshotRestoredEvent;
3030
import com.baidu.bifromq.basekv.raft.event.StatusChangedEvent;
3131
import com.baidu.bifromq.basekv.raft.event.SyncStateChangedEvent;
32+
import com.baidu.bifromq.basekv.raft.proto.ClusterConfig;
3233
import com.baidu.bifromq.basekv.raft.proto.LogEntry;
3334
import com.baidu.bifromq.basekv.raft.proto.RaftMessage;
3435
import com.baidu.bifromq.basekv.raft.proto.RaftNodeStatus;
@@ -209,6 +210,11 @@ public KVRangeSnapshot latestSnapshot() {
209210
return ZeroCopyParser.parse(raftNode.latestSnapshot(), KVRangeSnapshot.parser());
210211
}
211212

213+
@Override
214+
public ClusterConfig latestClusterConfig() {
215+
return raftNode.latestClusterConfig();
216+
}
217+
212218
@Override
213219
public CompletableFuture<Void> compact(KVRangeSnapshot snapshot) {
214220
return raftNode.compact(snapshot.toByteString(), snapshot.getLastAppliedIndex());

0 commit comments

Comments
 (0)