Skip to content

Commit 83ba915

Browse files
szymon-miezaldjatnieks
authored and committed
HCD-73: Add a jmx endpoint to change the node state in gossip (#1617)
Node crashes during node replacements result in hibernated nodes that can no longer join the cluster due to a lack of SYN messages from seeds. Port DB-1482, which allows the use of a JMX endpoint on a seed to bring the hibernated node back to the gossiping candidate list. Tested via: datastax/cassandra-dtest#75.
1 parent a02e160 commit 83ba915

File tree

3 files changed

+158
-1
lines changed

3 files changed

+158
-1
lines changed

src/java/org/apache/cassandra/gms/Gossiper.java

Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,9 @@
1717
*/
1818
package org.apache.cassandra.gms;
1919

20+
import java.io.ByteArrayInputStream;
21+
import java.io.DataInputStream;
22+
import java.io.IOException;
2023
import java.net.UnknownHostException;
2124
import java.util.ArrayList;
2225
import java.util.Arrays;
@@ -67,6 +70,7 @@
6770
import org.apache.cassandra.config.CassandraRelevantProperties;
6871
import org.apache.cassandra.config.DatabaseDescriptor;
6972
import org.apache.cassandra.db.SystemKeyspace;
73+
import org.apache.cassandra.dht.IPartitioner;
7074
import org.apache.cassandra.dht.Token;
7175
import org.apache.cassandra.locator.InetAddressAndPort;
7276
import org.apache.cassandra.net.Message;
@@ -1027,6 +1031,130 @@ else if (newState.getHeartBeatState().getHeartBeatVersion() != heartbeat)
10271031
});
10281032
}
10291033

1034+
public void reviveEndpoint(String address) throws UnknownHostException
1035+
{
1036+
InetAddressAndPort endpoint = InetAddressAndPort.getByName(address);
1037+
EndpointState epState = endpointStateMap.get(endpoint);
1038+
logger.warn("Reviving {} via gossip", endpoint);
1039+
1040+
if (epState == null)
1041+
throw new RuntimeException("Cannot revive endpoint " + endpoint + ": no endpoint-state");
1042+
1043+
int generation = epState.getHeartBeatState().getGeneration();
1044+
int heartbeat = epState.getHeartBeatState().getHeartBeatVersion();
1045+
1046+
logger.info("Have endpoint-state for {}: status={}, generation={}, heartbeat={}",
1047+
endpoint, epState.getStatus(), generation, heartbeat);
1048+
1049+
if (!isSilentShutdownState(epState))
1050+
throw new RuntimeException("Cannot revive endpoint " + endpoint + ": not in a (silent) shutdown state: " + epState.getStatus());
1051+
1052+
if (FailureDetector.instance.isAlive(endpoint))
1053+
throw new RuntimeException("Cannot revive endpoint " + endpoint + ": still alive (failure-detector)");
1054+
1055+
logger.info("Sleeping for {}ms to ensure {} does not change", StorageService.RING_DELAY_MILLIS, endpoint);
1056+
Uninterruptibles.sleepUninterruptibly(StorageService.RING_DELAY_MILLIS, TimeUnit.MILLISECONDS);
1057+
// make sure the endpoint state did not change
1058+
EndpointState newState = endpointStateMap.get(endpoint);
1059+
if (newState == null)
1060+
throw new RuntimeException("Cannot revive endpoint " + endpoint + ": endpoint-state disappeared");
1061+
if (newState.getHeartBeatState().getGeneration() != generation)
1062+
throw new RuntimeException("Cannot revive endpoint " + endpoint + ": still alive, generation changed while trying to reviving it");
1063+
if (newState.getHeartBeatState().getHeartBeatVersion() != heartbeat)
1064+
throw new RuntimeException("Cannot revive endpoint " + endpoint + ": still alive, heartbeat changed while trying to reviving it");
1065+
1066+
epState.updateTimestamp(); // make sure we don't evict it too soon
1067+
epState.getHeartBeatState().forceNewerGenerationUnsafe();
1068+
1069+
// using the tokens from the endpoint-state as that is the real source of truth
1070+
Collection<Token> tokens = getTokensFromEndpointState(epState, DatabaseDescriptor.getPartitioner());
1071+
if (tokens == null || tokens.isEmpty())
1072+
throw new RuntimeException("Cannot revive endpoint " + endpoint + ": no tokens from TokenMetadata");
1073+
1074+
epState.addApplicationState(ApplicationState.STATUS, StorageService.instance.valueFactory.normal(tokens));
1075+
epState.addApplicationState(ApplicationState.STATUS_WITH_PORT, StorageService.instance.valueFactory.normal(tokens));
1076+
handleMajorStateChange(endpoint, epState);
1077+
Uninterruptibles.sleepUninterruptibly(intervalInMillis * 4, TimeUnit.MILLISECONDS);
1078+
logger.warn("Finished reviving {}, status={}, generation={}, heartbeat={}",
1079+
endpoint, epState.getStatus(), generation, heartbeat);
1080+
}
1081+
1082+
public void unsafeSetEndpointState(String address, String status) throws UnknownHostException
1083+
{
1084+
logger.warn("Forcibly changing gossip status of " + address + " to " + status);
1085+
1086+
InetAddressAndPort endpoint = InetAddressAndPort.getByName(address);
1087+
EndpointState epState = endpointStateMap.get(endpoint);
1088+
1089+
if (epState == null)
1090+
throw new RuntimeException("No state for endpoint " + endpoint);
1091+
1092+
int generation = epState.getHeartBeatState().getGeneration();
1093+
int heartbeat = epState.getHeartBeatState().getHeartBeatVersion();
1094+
1095+
logger.info("Have endpoint-state for {}: status={}, generation={}, heartbeat={}",
1096+
endpoint, epState.getStatus(), generation, heartbeat);
1097+
1098+
if (FailureDetector.instance.isAlive(endpoint))
1099+
throw new RuntimeException("Cannot update status for endpoint " + endpoint + ": still alive (failure-detector)");
1100+
1101+
Collection<Token> tokens = getTokensFromEndpointState(epState, DatabaseDescriptor.getPartitioner());
1102+
1103+
VersionedValue newStatus;
1104+
switch (status.toLowerCase())
1105+
{
1106+
case "hibernate":
1107+
newStatus = StorageService.instance.valueFactory.hibernate(true);
1108+
break;
1109+
case "normal":
1110+
newStatus = StorageService.instance.valueFactory.normal(tokens);
1111+
break;
1112+
case "left":
1113+
newStatus = StorageService.instance.valueFactory.left(tokens, computeExpireTime());
1114+
break;
1115+
case "shutdown":
1116+
newStatus = StorageService.instance.valueFactory.shutdown(true);
1117+
break;
1118+
default:
1119+
throw new IllegalArgumentException("Unknown status '" + status + '\'');
1120+
}
1121+
1122+
epState.updateTimestamp(); // make sure we don't evict it too soon
1123+
epState.getHeartBeatState().forceNewerGenerationUnsafe();
1124+
1125+
epState.addApplicationState(ApplicationState.STATUS, newStatus);
1126+
epState.addApplicationState(ApplicationState.STATUS_WITH_PORT, newStatus);
1127+
1128+
handleMajorStateChange(endpoint, epState);
1129+
1130+
logger.warn("Forcibly changed gossip status of " + endpoint + " to " + newStatus);
1131+
}
1132+
1133+
public Collection<Token> getTokensFor(InetAddressAndPort endpoint, IPartitioner partitioner)
1134+
{
1135+
EndpointState state = getEndpointStateForEndpoint(endpoint);
1136+
if (state == null)
1137+
return Collections.emptyList();
1138+
1139+
return getTokensFromEndpointState(state, partitioner);
1140+
}
1141+
1142+
private Collection<Token> getTokensFromEndpointState(EndpointState state, IPartitioner partitioner)
1143+
{
1144+
try
1145+
{
1146+
VersionedValue versionedValue = state.getApplicationState(ApplicationState.TOKENS);
1147+
if (versionedValue == null)
1148+
return Collections.emptyList();
1149+
1150+
return TokenSerializer.deserialize(partitioner, new DataInputStream(new ByteArrayInputStream(versionedValue.toBytes())));
1151+
}
1152+
catch (IOException e)
1153+
{
1154+
throw new RuntimeException(e);
1155+
}
1156+
}
1157+
10301158
public int getCurrentGenerationNumber(InetAddressAndPort endpoint)
10311159
{
10321160
return endpointStateMap.get(endpoint).getHeartBeatState().getGeneration();

src/java/org/apache/cassandra/gms/GossiperMBean.java

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,37 @@ public interface GossiperMBean
2929

3030
public void unsafeAssassinateEndpoint(String address) throws UnknownHostException;
3131

32+
/**
33+
* Do not call this method unless you know what you are doing.
34+
* It will try extremely hard to obliterate any endpoint from the ring,
35+
* even if it does not know about it. Sets gossip status to {@code left}.
36+
*
37+
* @param address endpoint to assassinate
38+
*/
3239
public void assassinateEndpoint(String address) throws UnknownHostException;
3340

41+
/**
42+
* Do not call this method unless you know what you are doing.
43+
* In case a node went into a hibernate state - i.e. replacing a node with the <em>same</em> address
44+
* or bootstrapping a node without letting it join the ring - and it's required to bring that node back
45+
* to a normal status (e.g. for a failed replace operation), use this method.
46+
* It can be called on any node (preferably a seed node) to set the status back to {@code normal}.
47+
*
48+
* @param address endpoint to revive
49+
*/
50+
public void reviveEndpoint(String address) throws UnknownHostException;
51+
52+
/**
53+
* Completely unsafe method to set the Gossip status of an endpoint.
54+
* Primary intention is for testing only.
55+
* The method will refuse the request if (and only if) the {@link FailureDetector} still reports the endpoint as alive - no further
56+
* lifetime checks nor gossip state change safety barrier.
57+
*
58+
* @param address endpoint address
59+
* @param status One of {@code hibernate}, {@code normal}, {@code left}, {@code shutdown}
60+
*/
61+
public void unsafeSetEndpointState(String address, String status) throws UnknownHostException;
62+
3463
public List<String> reloadSeeds();
3564

3665
public List<String> getSeeds();

src/java/org/apache/cassandra/service/StorageService.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2954,7 +2954,7 @@ public void setRpcReady(boolean value)
29542954

29552955
private Collection<Token> getTokensFor(InetAddressAndPort endpoint)
29562956
{
2957-
return Nodes.getTokens(endpoint, Collections.emptyList());
2957+
return Gossiper.instance.getTokensFor(endpoint, getTokenMetadata().partitioner);
29582958
}
29592959

29602960
/**

0 commit comments

Comments
 (0)