diff --git a/lang/cs/Org.Apache.REEF.Common/AssemblyInfo.cs b/lang/cs/Org.Apache.REEF.Common/AssemblyInfo.cs new file mode 100644 index 0000000000..e60e48f257 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Common/AssemblyInfo.cs @@ -0,0 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +using System.Runtime.CompilerServices; + +[assembly: InternalsVisibleTo("Org.Apache.REEF.Network")] \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network.Examples.Client/ElasticBroadcastClient.cs b/lang/cs/Org.Apache.REEF.Network.Examples.Client/ElasticBroadcastClient.cs new file mode 100644 index 0000000000..960227e855 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network.Examples.Client/ElasticBroadcastClient.cs @@ -0,0 +1,149 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+using System;
+using System.Globalization;
+using System.IO;
+using Org.Apache.REEF.Driver;
+using Org.Apache.REEF.Tang.Implementations.Configuration;
+using Org.Apache.REEF.Tang.Implementations.Tang;
+using Org.Apache.REEF.Tang.Interface;
+using Org.Apache.REEF.Tang.Util;
+using Org.Apache.REEF.Utilities.Logging;
+using Org.Apache.REEF.Client.API;
+using Org.Apache.REEF.Client.Local;
+using Org.Apache.REEF.Client.Yarn;
+using Org.Apache.REEF.Network.Elastic.Config;
+using Org.Apache.REEF.Network.Elastic.Driver.Default;
+
+namespace Org.Apache.REEF.Network.Examples.Client
+{
+    /// <summary>
+    /// Job identifier strings passed to <c>ElasticBroadcastClient</c> by the example runner.
+    /// </summary>
+    internal class JobIdentifiers
+    {
+        public const string ElastiBroadcast = "ElasticBroadcast";
+        public const string ElastiBroadcastWithFailure = "ElasticBroadcastWithFailure";
+    }
+
+    /// <summary>
+    /// Client that builds the driver configuration for an elastic broadcast example and
+    /// submits it as a REEF job. All of the work happens in the constructor.
+    /// </summary>
+    public sealed class ElasticBroadcastClient where T : DefaultElasticDriver
+    {
+        private const string Local = "local";
+        private const string Yarn = "yarn";
+        private const string DefaultRuntimeFolder = "REEF_LOCAL_RUNTIME";
+        private const string stage = "Broadcast";
+
+        /// <summary>
+        /// Builds the driver configuration, merges it with the elastic group communication
+        /// bindings and submits the job through <c>TestRun</c>.
+        /// </summary>
+        /// <param name="runOnYarn">true selects the "yarn" runtime, false the "local" runtime.</param>
+        /// <param name="numTasks">Bound as a named parameter and also used as the evaluator count.</param>
+        /// <param name="startingPortNo">Bound as a named parameter of the driver configuration.</param>
+        /// <param name="portRange">Bound as a named parameter of the driver configuration.</param>
+        /// <param name="jobIdentifier">Identifier set on the submitted job request.</param>
+        public ElasticBroadcastClient(
+            bool runOnYarn,
+            int numTasks,
+            int startingPortNo,
+            int portRange,
+            string jobIdentifier)
+        {
+            string driverId = typeof(T).Name;
+            JobIdentifier = jobIdentifier;
+
+            IConfiguration driverConfig = TangFactory.GetTang()
+                .NewConfigurationBuilder(GetDriverConf())
+                .BindNamedParameter(
+                    GenericType.Class,
+                    numTasks.ToString(CultureInfo.InvariantCulture))
+                .BindNamedParameter(
+                    GenericType.Class,
+                    startingPortNo.ToString(CultureInfo.InvariantCulture))
+                .BindNamedParameter(
+                    GenericType.Class,
+                    portRange.ToString(CultureInfo.InvariantCulture))
+                .Build();
+
+            IConfiguration elasticGroupCommServiceDriverConfig = TangFactory.GetTang()
+                .NewConfigurationBuilder()
+                .BindStringNamedParam(driverId)
+                .BindStringNamedParam(stage)
+                .BindIntNamedParam(
+                    numTasks.ToString(CultureInfo.InvariantCulture))
+                .Build();
+
+            IConfiguration merged = Configurations
+                .Merge(driverConfig, elasticGroupCommServiceDriverConfig);
+
+            // Use the same constants that GetRuntimeConfiguration switches on, so the
+            // platform name cannot drift away from the case labels.
+            string runPlatform = runOnYarn ? Yarn : Local;
+
+            TestRun(merged, typeof(T), numTasks, JobIdentifier, runPlatform);
+        }
+
+        /// <summary>
+        /// Creates the REEF client for the requested runtime, builds a job request from the
+        /// driver configuration and submits it via <c>SubmitAndGetJobStatus</c>.
+        /// </summary>
+        private static void TestRun(
+            IConfiguration driverConfig,
+            Type globalAssemblyType,
+            int numberOfEvaluator,
+            string jobIdentifier = "myDriver",
+            string runOnYarn = Local,
+            string runtimeFolder = DefaultRuntimeFolder)
+        {
+            IInjector injector = TangFactory.GetTang()
+                .NewInjector(GetRuntimeConfiguration(runOnYarn, numberOfEvaluator, runtimeFolder));
+            var reefClient = injector.GetInstance();
+            var jobRequestBuilder = injector.GetInstance();
+            var jobSubmission = jobRequestBuilder
+                .AddDriverConfiguration(driverConfig)
+                .AddGlobalAssemblyForType(globalAssemblyType)
+                .SetJobIdentifier(jobIdentifier)
+                .Build();
+
+            reefClient.SubmitAndGetJobStatus(jobSubmission);
+        }
+
+        /// <summary>
+        /// Returns the client runtime configuration for "local" or "yarn".
+        /// </summary>
+        /// <exception cref="ArgumentException">The runtime name is neither "local" nor "yarn".</exception>
+        private static IConfiguration GetRuntimeConfiguration(
+            string runOnYarn,
+            int numberOfEvaluator,
+            string runtimeFolder)
+        {
+            switch (runOnYarn)
+            {
+                case Local:
+                    var dir = Path.Combine(".", runtimeFolder);
+                    return LocalRuntimeClientConfiguration.ConfigurationModule
+                        .Set(
+                            LocalRuntimeClientConfiguration.NumberOfEvaluators,
+                            numberOfEvaluator.ToString(CultureInfo.InvariantCulture))
+                        .Set(LocalRuntimeClientConfiguration.RuntimeFolder, dir)
+                        .Build();
+
+                case Yarn:
+                    return YARNClientConfiguration.ConfigurationModule.Build();
+
+                default:
+                    throw new ArgumentException("Unknown runtime: " + runOnYarn);
+            }
+        }
+
+        private string JobIdentifier { get; set; }
+
+        /// <summary>
+        /// Builds the driver configuration module: one handler binding per driver event
+        /// plus the Info trace level.
+        /// </summary>
+        private IConfiguration GetDriverConf()
+        {
+            return DriverConfiguration.ConfigurationModule
+                .Set(DriverConfiguration.OnDriverStarted, GenericType.Class)
+                .Set(DriverConfiguration.OnEvaluatorAllocated, GenericType.Class)
+                .Set(DriverConfiguration.OnEvaluatorFailed, GenericType.Class)
+                .Set(DriverConfiguration.OnContextActive, GenericType.Class)
+                .Set(DriverConfiguration.OnTaskRunning, GenericType.Class)
+                .Set(DriverConfiguration.OnTaskCompleted, GenericType.Class)
+                .Set(DriverConfiguration.OnTaskFailed, GenericType.Class)
+                .Set(DriverConfiguration.OnTaskMessage, GenericType.Class)
+                .Set(DriverConfiguration.CustomTraceLevel, Level.Info.ToString())
+                .Build();
+        }
+    }
+}
\ No newline at end of file
diff --git a/lang/cs/Org.Apache.REEF.Network.Examples.Client/Org.Apache.REEF.Network.Examples.Client.csproj b/lang/cs/Org.Apache.REEF.Network.Examples.Client/Org.Apache.REEF.Network.Examples.Client.csproj
index 42f75ce322..c2db3a7fbf 100644
--- a/lang/cs/Org.Apache.REEF.Network.Examples.Client/Org.Apache.REEF.Network.Examples.Client.csproj
+++ b/lang/cs/Org.Apache.REEF.Network.Examples.Client/Org.Apache.REEF.Network.Examples.Client.csproj
@@ -21,6 +21,7 @@ under the License.
 Org.Apache.REEF.Network.Examples.Client REEF Network Client examples REEF Examples Network Client
+
@@ -34,5 +35,5 @@ under the License.
-
+
\ No newline at end of file
diff --git a/lang/cs/Org.Apache.REEF.Network.Examples.Client/Run.cs b/lang/cs/Org.Apache.REEF.Network.Examples.Client/Run.cs
index 6b989f1057..c4bda166c2 100644
--- a/lang/cs/Org.Apache.REEF.Network.Examples.Client/Run.cs
+++ b/lang/cs/Org.Apache.REEF.Network.Examples.Client/Run.cs
@@ -15,22 +15,35 @@
 // specific language governing permissions and limitations
 // under the License.
-using System; +using Org.Apache.REEF.Network.Examples.Elastic; using Org.Apache.REEF.Network.Examples.GroupCommunication; +using System; namespace Org.Apache.REEF.Network.Examples.Client { + internal enum TestType + { + PipelineBroadcastAndReduce, + BroadcastAndReduce, + ElasticBroadcast, + ElasticBroadcastWithFailureInConstructor, + ElasticBroadcastWithFailureBeforeWorkflow, + ElasticBroadcastWithFailEvaluatorBeforeWorkflow, + ElasticBroadcastWithFailureBeforeBroadcast, + ElasticBroadcastWithFailureAfterBroadcast, + ElasticBroadcastWithMultipleFailures + } + public class Run { public static void Main(string[] args) { Console.WriteLine("start running client: " + DateTime.Now); bool runOnYarn = false; - int numNodes = 9; + int numNodes = 5; int startPort = 8900; int portRange = 1000; - string testToRun = "RunBroadcastAndReduce"; - testToRun = testToRun.ToLower(); + string testToRun = TestType.ElasticBroadcastWithFailEvaluatorBeforeWorkflow.ToString(); if (args != null) { @@ -56,11 +69,11 @@ public static void Main(string[] args) if (args.Length > 4) { - testToRun = args[4].ToLower(); + testToRun = args[4]; } } - if (testToRun.Equals("RunPipelineBroadcastAndReduce".ToLower()) || testToRun.Equals("all")) + if (TestType.PipelineBroadcastAndReduce.Match(testToRun)) { int arraySize = GroupTestConstants.ArrayLength; int chunkSize = GroupTestConstants.ChunkSize; @@ -71,16 +84,111 @@ public static void Main(string[] args) chunkSize = int.Parse(args[6]); } - new PipelineBroadcastAndReduceClient().RunPipelineBroadcastAndReduce(runOnYarn, numNodes, startPort, - portRange, arraySize, chunkSize); - Console.WriteLine("RunPipelineBroadcastAndReduce completed!!!"); + new PipelineBroadcastAndReduceClient().RunPipelineBroadcastAndReduce( + runOnYarn, + numNodes, + startPort, + portRange, + arraySize, + chunkSize); + Console.WriteLine("PipelineBroadcastAndReduce completed!!!"); + } + + if (TestType.BroadcastAndReduce.Match(testToRun)) + { + new 
BroadcastAndReduceClient().RunBroadcastAndReduce( + runOnYarn, + numNodes, + startPort, + portRange); + Console.WriteLine("BroadcastAndReduce completed!!!"); + } + + if (TestType.ElasticBroadcast.Match(testToRun)) + { + new ElasticBroadcastClient( + runOnYarn, + numNodes, + startPort, + portRange, + JobIdentifiers.ElastiBroadcast); + Console.WriteLine("ElasticBroadcast completed!!!"); + } + + if (TestType.ElasticBroadcastWithFailureInConstructor.Match(testToRun)) + { + new ElasticBroadcastClient>( + runOnYarn, + numNodes, + startPort, + portRange, + JobIdentifiers.ElastiBroadcastWithFailure); + Console.WriteLine("ElasticBroadcastWithFailureInConstructor completed!!!"); + } + + if (TestType.ElasticBroadcastWithFailureBeforeWorkflow.Match(testToRun)) + { + new ElasticBroadcastClient>( + runOnYarn, + numNodes, + startPort, + portRange, + JobIdentifiers.ElastiBroadcastWithFailure); + Console.WriteLine("ElasticBroadcastWithFailureBeforeWorkflow completed!!!"); + } + + if (TestType.ElasticBroadcastWithFailEvaluatorBeforeWorkflow.Match(testToRun)) + { + new ElasticBroadcastClient>( + runOnYarn, + numNodes, + startPort, + portRange, + JobIdentifiers.ElastiBroadcastWithFailure); + Console.WriteLine("ElasticBroadcastWithFailEvaluatorBeforeWorkflow completed!!!"); + } + + if (TestType.ElasticBroadcastWithFailureBeforeBroadcast.Match(testToRun)) + { + new ElasticBroadcastClient>( + runOnYarn, + numNodes, + startPort, + portRange, + JobIdentifiers.ElastiBroadcastWithFailure); + Console.WriteLine("ElasticBroadcastWithFailureBeforeBroadcast completed!!!"); + } + + if (TestType.ElasticBroadcastWithFailureAfterBroadcast.Match(testToRun)) + { + new ElasticBroadcastClient>( + runOnYarn, + numNodes, + startPort, + portRange, + JobIdentifiers.ElastiBroadcastWithFailure); + Console.WriteLine("ElasticBroadcastWithFailureAfterBroadcast completed!!!"); } - if (testToRun.Equals("RunBroadcastAndReduce".ToLower()) || testToRun.Equals("all")) + if 
(TestType.ElasticBroadcastWithMultipleFailures.Match(testToRun))
             {
-                new BroadcastAndReduceClient().RunBroadcastAndReduce(runOnYarn, numNodes, startPort, portRange);
-                Console.WriteLine("RunBroadcastAndReduce completed!!!");
-            }
+                new ElasticBroadcastClient>(
+                    runOnYarn,
+                    numNodes,
+                    startPort,
+                    portRange,
+                    JobIdentifiers.ElastiBroadcastWithFailure);
+                Console.WriteLine("ElasticBroadcastWithMultipleFailures completed!!!");
+            }
+        }
+    }
+
+    /// <summary>
+    /// Extension helper that decides whether a <c>TestType</c> was selected by name.
+    /// </summary>
+    internal static class TestTypeMatcher
+    {
+        /// <summary>
+        /// Returns true when <paramref name="name"/> is "all" or equals the enum member
+        /// name of <paramref name="test"/>, ignoring case.
+        /// </summary>
+        /// <remarks>
+        /// The comparison is ordinal, so the result does not depend on the current culture
+        /// (a culture-sensitive ToLower maps 'I' differently under tr-TR). A null name
+        /// matches nothing instead of throwing.
+        /// </remarks>
+        public static bool Match(this TestType test, string name)
+        {
+            return string.Equals(name, "all", StringComparison.OrdinalIgnoreCase)
+                || string.Equals(test.ToString(), name, StringComparison.OrdinalIgnoreCase);
         }
     }
-}
+}
\ No newline at end of file
diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/BroadcastMasterTask.cs b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/BroadcastMasterTask.cs
new file mode 100644
index 0000000000..4710fca41a
--- /dev/null
+++ b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/BroadcastMasterTask.cs
@@ -0,0 +1,62 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+using System;
+using Org.Apache.REEF.Tang.Annotations;
+using Org.Apache.REEF.Network.Elastic.Task;
+using Org.Apache.REEF.Network.Elastic.Operators;
+using Org.Apache.REEF.Network.Elastic.Operators.Physical;
+using Org.Apache.REEF.Network.Elastic.Task.Default;
+using Org.Apache.REEF.Utilities.Logging;
+
+namespace Org.Apache.REEF.Network.Examples.Elastic
+{
+    /// <summary>
+    /// Master task of the broadcast example: for every Broadcast operator in the workflow
+    /// it sends one randomly drawn integer and logs it.
+    /// </summary>
+    public sealed class BroadcastMasterTask : DefaultElasticTask
+    {
+        private static readonly Logger Log = Logger.GetLogger(typeof(BroadcastMasterTask));
+
+        [Inject]
+        private BroadcastMasterTask(CancellationSource source, IElasticContext context)
+            : base(source, context, "Broadcast")
+        {
+        }
+
+        private readonly Random _rand = new Random();
+
+        /// <summary>
+        /// Iterates the workflow and sends a random number through each Broadcast operator.
+        /// </summary>
+        /// <exception cref="InvalidOperationException">
+        /// The workflow contains an operator other than Broadcast, or the current operator
+        /// cannot be used as a broadcast sender.
+        /// </exception>
+        protected override void Execute(byte[] memento, Workflow workflow)
+        {
+            foreach (var op in workflow)
+            {
+                switch (op.OperatorType)
+                {
+                    case OperatorType.Broadcast:
+                        var sender = workflow.Current as IElasticBroadcast;
+
+                        // 'as' yields null on a type mismatch; fail with a clear message
+                        // rather than a NullReferenceException on Send.
+                        if (sender == null)
+                        {
+                            throw new InvalidOperationException(
+                                $"Operation {workflow.Current} in workflow is not a broadcast operator.");
+                        }
+
+                        int number = _rand.Next();
+
+                        sender.Send(number);
+
+                        Log.Log(Level.Info, "Master has sent {0}", number);
+                        break;
+
+                    default:
+                        throw new InvalidOperationException(
+                            $"Operation {workflow.Current} in workflow not implemented.");
+                }
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/BroadcastSlaveTask.cs b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/BroadcastSlaveTask.cs
new file mode 100644
index 0000000000..988acba951
--- /dev/null
+++ b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/BroadcastSlaveTask.cs
@@ -0,0 +1,59 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.
You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+using System;
+using Org.Apache.REEF.Tang.Annotations;
+using Org.Apache.REEF.Network.Elastic.Task;
+using Org.Apache.REEF.Network.Elastic.Operators.Physical;
+using Org.Apache.REEF.Network.Elastic.Operators;
+using Org.Apache.REEF.Network.Elastic.Task.Default;
+using Org.Apache.REEF.Utilities.Logging;
+
+namespace Org.Apache.REEF.Network.Examples.Elastic
+{
+    /// <summary>
+    /// Slave task of the broadcast example: for every Broadcast operator in the workflow
+    /// it receives one value and logs it.
+    /// </summary>
+    public sealed class BroadcastSlaveTask : DefaultElasticTask
+    {
+        private static readonly Logger Log = Logger.GetLogger(typeof(BroadcastSlaveTask));
+
+        [Inject]
+        public BroadcastSlaveTask(CancellationSource source, IElasticContext context)
+            : base(source, context, "Broadcast")
+        {
+        }
+
+        /// <summary>
+        /// Iterates the workflow and receives a value from each Broadcast operator.
+        /// </summary>
+        /// <exception cref="InvalidOperationException">
+        /// The workflow contains an operator other than Broadcast, or the current operator
+        /// cannot be used as a broadcast receiver.
+        /// </exception>
+        protected override void Execute(byte[] memento, Workflow workflow)
+        {
+            foreach (var op in workflow)
+            {
+                switch (op.OperatorType)
+                {
+                    case OperatorType.Broadcast:
+                        var receiver = workflow.Current as IElasticBroadcast;
+
+                        // 'as' yields null on a type mismatch; fail with a clear message
+                        // rather than a NullReferenceException on Receive.
+                        if (receiver == null)
+                        {
+                            throw new InvalidOperationException(
+                                $"Operation {workflow.Current} in workflow is not a broadcast operator.");
+                        }
+
+                        var rec = receiver.Receive();
+
+                        Log.Log(Level.Info, $"Slave has received {rec}");
+                        break;
+
+                    default:
+                        throw new InvalidOperationException(
+                            $"Operation {workflow.Current} in workflow not implemented.");
+                }
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/ElasticBroadcastDriver.cs b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/ElasticBroadcastDriver.cs
new file mode 100644
index 0000000000..345feade2f
--- /dev/null
+++ b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/ElasticBroadcastDriver.cs
@@ -0,0 +1,69 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license
agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+using System;
+using Org.Apache.REEF.Tang.Annotations;
+using Org.Apache.REEF.Tang.Implementations.Tang;
+using Org.Apache.REEF.Tang.Interface;
+using Org.Apache.REEF.Tang.Util;
+using Org.Apache.REEF.Network.Elastic.Driver;
+using Org.Apache.REEF.Common.Tasks;
+using Org.Apache.REEF.Network.Elastic.Topology.Logical.Enum;
+using Org.Apache.REEF.Network.Elastic.Driver.Default;
+
+namespace Org.Apache.REEF.Network.Examples.Elastic
+{
+    /// <summary>
+    /// Example driver that broadcasts over a flat topology using the elastic group
+    /// communication service.
+    /// </summary>
+    public sealed class ElasticBroadcastDriver : DefaultElasticDriver
+    {
+        /// <summary>
+        /// Configures the default stage with a single Broadcast operator, builds it, and
+        /// assigns a task set manager built for that stage to <c>TaskSetManager</c>.
+        /// </summary>
+        [Inject]
+        private ElasticBroadcastDriver(IElasticContext context) : base(context)
+        {
+            IElasticStage broadcastStage = Context.DefaultStage();
+
+            // Pipeline: one flat-topology broadcast.
+            broadcastStage.PipelineRoot
+                .Broadcast(TopologyType.Flat)
+                .Build();
+
+            // The stage must be built before it is registered with the manager.
+            IElasticStage builtStage = broadcastStage.Build();
+
+            var managerBuilder = Context
+                .CreateNewTaskSetManager(MasterTaskConfiguration, SlaveTaskConfiguration);
+
+            TaskSetManager = managerBuilder
+                .AddStage(builtStage)
+                .Build();
+        }
+
+        /// <summary>
+        /// Builds the task configuration used for the master task with the given id.
+        /// </summary>
+        private IConfiguration MasterTaskConfiguration(string taskId)
+        {
+            var module = Context.GetTaskConfigurationModule(taskId);
+            return module
+                .Set(TaskConfiguration.Task, GenericType.Class)
+                .Build();
+        }
+
+        /// <summary>
+        /// Builds the task configuration used for a slave task with the given id.
+        /// </summary>
+        private IConfiguration SlaveTaskConfiguration(string taskId)
+        {
+            var module = Context.GetTaskConfigurationModule(taskId);
+            return module
+                .Set(TaskConfiguration.Task, GenericType.Class)
+                .Build();
+        }
+    }
+}
\ No newline at end of file
diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieAfterBroadcast.cs b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieAfterBroadcast.cs
new file mode 100644
index 0000000000..9e9b07d2c2
--- /dev/null
+++ b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieAfterBroadcast.cs
@@ -0,0 +1,73 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.
You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+using Org.Apache.REEF.Common.Tasks;
+using Org.Apache.REEF.Network.Elastic;
+using Org.Apache.REEF.Network.Elastic.Operators;
+using Org.Apache.REEF.Network.Elastic.Operators.Physical;
+using Org.Apache.REEF.Network.Elastic.Task;
+using Org.Apache.REEF.Network.Elastic.Task.Default;
+using Org.Apache.REEF.Tang.Annotations;
+using Org.Apache.REEF.Utilities.Logging;
+using System;
+
+namespace Org.Apache.REEF.Network.Examples.Elastic
+{
+    /// <summary>
+    /// Slave task that receives the broadcast value and then throws when
+    /// <c>Utils.GetTaskNum</c> returns 2 for its task identifier.
+    /// </summary>
+    public sealed class BroadcastSlaveTaskDieAfterBroadcast : DefaultElasticTask
+    {
+        private static readonly Logger Log = Logger.GetLogger(
+            typeof(BroadcastSlaveTaskDieAfterBroadcast));
+
+        private readonly string _taskId;
+
+        [Inject]
+        public BroadcastSlaveTaskDieAfterBroadcast(
+            [Parameter(typeof(TaskConfigurationOptions.Identifier))] string taskId,
+            CancellationSource source, IElasticContext context)
+            : base(source, context, "Broadcast")
+        {
+            _taskId = taskId;
+        }
+
+        /// <summary>
+        /// Receives and logs one value per Broadcast operator; operators of any other
+        /// type are ignored. The failure is raised after the receive has completed.
+        /// </summary>
+        protected override void Execute(byte[] memento, Workflow workflow)
+        {
+            foreach (var op in workflow)
+            {
+                if (op.OperatorType != OperatorType.Broadcast)
+                {
+                    continue;
+                }
+
+                var receiver = workflow.Current as IElasticBroadcast;
+                var received = receiver.Receive();
+
+                Log.Log(Level.Info, "Slave has received {0}", received);
+
+                bool shouldDie = Utils.GetTaskNum(_taskId) == 2;
+                if (shouldDie)
+                {
+                    throw new Exception("Die after broadcast.");
+                }
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieBeforeBroadcast.cs
b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieBeforeBroadcast.cs new file mode 100644 index 0000000000..8a5e4245be --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieBeforeBroadcast.cs @@ -0,0 +1,72 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+
+using Org.Apache.REEF.Common.Tasks;
+using Org.Apache.REEF.Network.Elastic;
+using Org.Apache.REEF.Network.Elastic.Operators;
+using Org.Apache.REEF.Network.Elastic.Operators.Physical;
+using Org.Apache.REEF.Network.Elastic.Task;
+using Org.Apache.REEF.Network.Elastic.Task.Default;
+using Org.Apache.REEF.Tang.Annotations;
+using Org.Apache.REEF.Utilities.Logging;
+using System;
+
+namespace Org.Apache.REEF.Network.Examples.Elastic
+{
+    /// <summary>
+    /// Slave task that throws right before receiving the broadcast value when
+    /// <c>Utils.GetTaskNum</c> returns 2 for its task identifier.
+    /// </summary>
+    public sealed class BroadcastSlaveTaskDieBeforeBroadcast : DefaultElasticTask
+    {
+        private static readonly Logger Log = Logger.GetLogger(
+            typeof(BroadcastSlaveTaskDieBeforeBroadcast));
+
+        private readonly string _taskId;
+
+        [Inject]
+        public BroadcastSlaveTaskDieBeforeBroadcast(
+            [Parameter(typeof(TaskConfigurationOptions.Identifier))] string taskId,
+            CancellationSource source, IElasticContext context)
+            : base(source, context, "Broadcast")
+        {
+            _taskId = taskId;
+        }
+
+        /// <summary>
+        /// For each Broadcast operator: throws if this is the failing task, otherwise
+        /// receives and logs the value. Operators of any other type are ignored.
+        /// </summary>
+        protected override void Execute(byte[] memento, Workflow workflow)
+        {
+            foreach (var op in workflow)
+            {
+                if (op.OperatorType != OperatorType.Broadcast)
+                {
+                    continue;
+                }
+
+                bool shouldDie = Utils.GetTaskNum(_taskId) == 2;
+                if (shouldDie)
+                {
+                    throw new Exception("Die before broadcast.");
+                }
+
+                var receiver = workflow.Current as IElasticBroadcast;
+                var received = receiver.Receive();
+
+                Log.Log(Level.Info, "Slave has received {0}", received);
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieBeforeWorkflow.cs b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieBeforeWorkflow.cs
new file mode 100644
index 0000000000..aeeda3eb68
--- /dev/null
+++ b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieBeforeWorkflow.cs
@@ -0,0 +1,71 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.
See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+using System;
+using Org.Apache.REEF.Tang.Annotations;
+using Org.Apache.REEF.Network.Elastic.Task;
+using Org.Apache.REEF.Network.Elastic.Operators.Physical;
+using Org.Apache.REEF.Network.Elastic.Operators;
+using Org.Apache.REEF.Network.Elastic.Task.Default;
+using Org.Apache.REEF.Common.Tasks;
+using Org.Apache.REEF.Network.Elastic;
+using Org.Apache.REEF.Utilities.Logging;
+
+namespace Org.Apache.REEF.Network.Examples.Elastic
+{
+    /// <summary>
+    /// Slave task that throws before touching the workflow when
+    /// <c>Utils.GetTaskNum</c> returns 2 for its task identifier.
+    /// </summary>
+    public sealed class BroadcastSlaveTaskDieBeforeWorkflow : DefaultElasticTask
+    {
+        private static readonly Logger Log = Logger.GetLogger(
+            typeof(BroadcastSlaveTaskDieBeforeWorkflow));
+
+        private readonly string _taskId;
+
+        [Inject]
+        public BroadcastSlaveTaskDieBeforeWorkflow(
+            [Parameter(typeof(TaskConfigurationOptions.Identifier))] string taskId,
+            CancellationSource source, IElasticContext context)
+            : base(source, context, "Broadcast")
+        {
+            _taskId = taskId;
+        }
+
+        /// <summary>
+        /// Throws up front for the failing task; otherwise receives and logs one value
+        /// per Broadcast operator and ignores operators of any other type.
+        /// </summary>
+        protected override void Execute(byte[] memento, Workflow workflow)
+        {
+            bool shouldDie = Utils.GetTaskNum(_taskId) == 2;
+            if (shouldDie)
+            {
+                throw new Exception("Die before workflow.");
+            }
+
+            foreach (var op in workflow)
+            {
+                if (op.OperatorType != OperatorType.Broadcast)
+                {
+                    continue;
+                }
+
+                var receiver = workflow.Current as IElasticBroadcast;
+                var received = receiver.Receive();
+
+                Log.Log(Level.Info, "Slave has received {0}", received);
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieEvaluatorBeforeWorkflow.cs b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieEvaluatorBeforeWorkflow.cs
new file mode 100644
index 0000000000..7f30b6fdb5
--- /dev/null
+++ b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieEvaluatorBeforeWorkflow.cs
@@ -0,0 +1,72 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+using System;
+using Org.Apache.REEF.Tang.Annotations;
+using Org.Apache.REEF.Network.Elastic.Task;
+using Org.Apache.REEF.Network.Elastic.Operators.Physical;
+using Org.Apache.REEF.Network.Elastic.Operators;
+using Org.Apache.REEF.Network.Elastic.Task.Default;
+using Org.Apache.REEF.Common.Tasks;
+using Org.Apache.REEF.Network.Elastic;
+using Org.Apache.REEF.Utilities.Logging;
+
+namespace Org.Apache.REEF.Network.Examples.Elastic
+{
+    /// <summary>
+    /// Slave task that terminates its whole process via <c>Environment.Exit(0)</c> before
+    /// touching the workflow when <c>Utils.GetTaskNum</c> returns 2 for its task identifier.
+    /// </summary>
+    public sealed class BroadcastSlaveTaskDieEvaluatorBeforeWorkflow : DefaultElasticTask
+    {
+        private static readonly Logger Log = Logger.GetLogger(
+            typeof(BroadcastSlaveTaskDieEvaluatorBeforeWorkflow));
+
+        private readonly string _taskId;
+
+        [Inject]
+        public BroadcastSlaveTaskDieEvaluatorBeforeWorkflow(
+            [Parameter(typeof(TaskConfigurationOptions.Identifier))] string taskId,
+            CancellationSource source, IElasticContext context)
+            : base(source, context, "Broadcast")
+        {
+            _taskId = taskId;
+        }
+
+        /// <summary>
+        /// Exits the process up front for the failing task; otherwise receives and logs
+        /// one value per Broadcast operator and ignores operators of any other type.
+        /// </summary>
+        protected override void Execute(byte[] memento, Workflow workflow)
+        {
+            bool shouldDie = Utils.GetTaskNum(_taskId) == 2;
+            if (shouldDie)
+            {
+                Console.WriteLine("Die before workflow.");
+                Environment.Exit(0);
+            }
+
+            foreach (var op in workflow)
+            {
+                if (op.OperatorType != OperatorType.Broadcast)
+                {
+                    continue;
+                }
+
+                var receiver = workflow.Current as IElasticBroadcast;
+                var received = receiver.Receive();
+
+                Log.Log(Level.Info, "Slave has received {0}", received);
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieInConstructor.cs b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieInConstructor.cs
new file mode 100644
index 0000000000..323ebf4359
--- /dev/null
+++ b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieInConstructor.cs
@@ -0,0 +1,67 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.
See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+using System;
+using Org.Apache.REEF.Tang.Annotations;
+using Org.Apache.REEF.Network.Elastic.Task;
+using Org.Apache.REEF.Network.Elastic.Operators.Physical;
+using Org.Apache.REEF.Network.Elastic.Operators;
+using Org.Apache.REEF.Network.Elastic.Task.Default;
+using Org.Apache.REEF.Common.Tasks;
+using Org.Apache.REEF.Network.Elastic;
+using Org.Apache.REEF.Utilities.Logging;
+
+namespace Org.Apache.REEF.Network.Examples.Elastic
+{
+    /// <summary>
+    /// Slave task whose constructor throws when <c>Utils.GetTaskNum</c> returns 2 for
+    /// the injected task identifier.
+    /// </summary>
+    public sealed class BroadcastSlaveTaskDieInConstructor : DefaultElasticTask
+    {
+        private static readonly Logger Log = Logger.GetLogger(
+            typeof(BroadcastSlaveTaskDieInConstructor));
+
+        [Inject]
+        public BroadcastSlaveTaskDieInConstructor(
+            [Parameter(typeof(TaskConfigurationOptions.Identifier))] string taskId,
+            CancellationSource source, IElasticContext context)
+            : base(source, context, "Broadcast")
+        {
+            bool shouldDie = Utils.GetTaskNum(taskId) == 2;
+            if (shouldDie)
+            {
+                throw new Exception("Die in Constructor.");
+            }
+        }
+
+        /// <summary>
+        /// Receives and logs one value per Broadcast operator; operators of any other
+        /// type are ignored.
+        /// </summary>
+        protected override void Execute(byte[] memento, Workflow workflow)
+        {
+            foreach (var op in workflow)
+            {
+                if (op.OperatorType != OperatorType.Broadcast)
+                {
+                    continue;
+                }
+
+                var receiver = workflow.Current as IElasticBroadcast;
+                var received = receiver.Receive();
+
+                Log.Log(Level.Info, "Slave has received {0}", received);
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieMultiple.cs b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieMultiple.cs
new file mode 100644
index 0000000000..4f36b54ea1
--- /dev/null
+++ b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieMultiple.cs
@@ -0,0 +1,75 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+using System;
+using Org.Apache.REEF.Tang.Annotations;
+using Org.Apache.REEF.Network.Elastic.Task;
+using Org.Apache.REEF.Network.Elastic.Operators.Physical;
+using Org.Apache.REEF.Network.Elastic.Operators;
+using Org.Apache.REEF.Network.Elastic.Task.Default;
+using Org.Apache.REEF.Utilities.Logging;
+
+namespace Org.Apache.REEF.Network.Examples.Elastic
+{
+    /// <summary>
+    /// Failure-injection example for the "Broadcast" stage: the task throws with a
+    /// fixed probability before the workflow and again before every broadcast
+    /// receive; when it does not fail it logs the received value.
+    /// </summary>
+    public sealed class BroadcastSlaveTaskDieMultiple : DefaultElasticTask
+    {
+        private static readonly Logger Log = Logger.GetLogger(
+            typeof(BroadcastSlaveTaskDieMultiple));
+
+        // Chance, in percent, that each injection point throws.
+        private const int FailPercentage = 70;
+        private readonly Random _rand = new Random();
+
+        /// <summary>
+        /// Injectable constructor.
+        /// </summary>
+        /// <param name="source">Forwarded to the base task</param>
+        /// <param name="context">Forwarded to the base task</param>
+        [Inject]
+        public BroadcastSlaveTaskDieMultiple(
+            CancellationSource source, IElasticContext context)
+            : base(source, context, "Broadcast")
+        {
+        }
+
+        /// <summary>
+        /// Iterates the workflow, randomly throwing before the loop and before each
+        /// broadcast receive.
+        /// </summary>
+        /// <param name="memento">Not used by this example</param>
+        /// <param name="workflow">The workflow to iterate</param>
+        protected override void Execute(byte[] memento, Workflow workflow)
+        {
+            FailAtRandom("Die.");
+
+            foreach (var op in workflow)
+            {
+                switch (op.OperatorType)
+                {
+                    case OperatorType.Broadcast:
+
+                        FailAtRandom("Die");
+
+                        // Direct cast: an operator of the wrong type fails right here with
+                        // InvalidCastException instead of a NullReferenceException on Receive().
+                        var receiver = (IElasticBroadcast)workflow.Current;
+
+                        var rec = receiver.Receive();
+
+                        Log.Log(Level.Info, "Slave has received {0}", rec);
+
+                        break;
+
+                    default:
+                        break;
+                }
+            }
+        }
+
+        /// <summary>
+        /// Throws an Exception carrying <paramref name="message"/> with a probability
+        /// of FailPercentage percent; otherwise does nothing.
+        /// </summary>
+        private void FailAtRandom(string message)
+        {
+            if (_rand.Next(100) < FailPercentage)
+            {
+                throw new Exception(message);
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieMultipleEvaluators.cs b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieMultipleEvaluators.cs
new file mode 100644
index 0000000000..1032e2670a
--- /dev/null
+++ b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieMultipleEvaluators.cs
@@ -0,0 +1,79 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.
The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+using System;
+using Org.Apache.REEF.Tang.Annotations;
+using Org.Apache.REEF.Network.Elastic.Task;
+using Org.Apache.REEF.Network.Elastic.Operators.Physical;
+using Org.Apache.REEF.Network.Elastic.Operators;
+using Org.Apache.REEF.Network.Elastic.Task.Default;
+using Org.Apache.REEF.Utilities.Logging;
+
+namespace Org.Apache.REEF.Network.Examples.Elastic
+{
+    /// <summary>
+    /// Failure-injection example for the "Broadcast" stage: the task exits its
+    /// process with a fixed probability before the workflow, before each broadcast
+    /// receive and after each broadcast receive.
+    /// </summary>
+    public sealed class BroadcastSlaveTaskDieMultipleEvaluators : DefaultElasticTask
+    {
+        private static readonly Logger Log = Logger.GetLogger(
+            typeof(BroadcastSlaveTaskDieMultipleEvaluators));
+
+        // Chance, in percent, that each injection point exits the process.
+        private const int FailPercentage = 50;
+        private readonly Random _rand = new Random();
+
+        /// <summary>
+        /// Injectable constructor.
+        /// </summary>
+        /// <param name="source">Forwarded to the base task</param>
+        /// <param name="context">Forwarded to the base task</param>
+        [Inject]
+        public BroadcastSlaveTaskDieMultipleEvaluators(
+            CancellationSource source, IElasticContext context)
+            : base(source, context, "Broadcast")
+        {
+        }
+
+        /// <summary>
+        /// Iterates the workflow, randomly exiting the process around each broadcast receive.
+        /// </summary>
+        /// <param name="memento">Not used by this example</param>
+        /// <param name="workflow">The workflow to iterate</param>
+        protected override void Execute(byte[] memento, Workflow workflow)
+        {
+            ExitAtRandom();
+
+            foreach (var op in workflow)
+            {
+                switch (op.OperatorType)
+                {
+                    case OperatorType.Broadcast:
+
+                        ExitAtRandom();
+
+                        // Direct cast: an operator of the wrong type fails right here with
+                        // InvalidCastException instead of a NullReferenceException on Receive().
+                        var receiver = (IElasticBroadcast)workflow.Current;
+
+                        var rec = receiver.Receive();
+
+                        Log.Log(Level.Info, "Slave has received {0}", rec);
+
+                        ExitAtRandom();
+                        break;
+
+                    default:
+                        break;
+                }
+            }
+        }
+
+        /// <summary>
+        /// Exits the process with exit code 0 with a probability of FailPercentage
+        /// percent; otherwise does nothing.
+        /// </summary>
+        private void ExitAtRandom()
+        {
+            if (_rand.Next(100) < FailPercentage)
+            {
+                Environment.Exit(0);
+            }
+        }
+    }
+}
\ No newline at end of
file diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithFailures.cs b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithFailures.cs new file mode 100644 index 0000000000..521457137d --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithFailures.cs @@ -0,0 +1,86 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +using System; +using Org.Apache.REEF.Tang.Annotations; +using Org.Apache.REEF.Tang.Implementations.Tang; +using Org.Apache.REEF.Tang.Interface; +using Org.Apache.REEF.Tang.Util; +using Org.Apache.REEF.Network.Elastic.Driver; +using Org.Apache.REEF.Common.Tasks; +using Org.Apache.REEF.Network.Elastic.Topology.Logical.Enum; +using Org.Apache.REEF.Network.Elastic.Driver.Default; +using Org.Apache.REEF.Network.Elastic.Failures; +using Org.Apache.REEF.Network.Elastic.Failures.Default; +using Org.Apache.REEF.Network.Elastic.Task.Default; +using Org.Apache.REEF.Network.Elastic.Config; + +namespace Org.Apache.REEF.Network.Examples.Elastic +{ + /// + /// Example implementation of broadcasting using the elastic group communication service. 
+ /// + public sealed class ElasticBroadcastDriverWithFailures + : DefaultElasticDriver + where TSlave : DefaultElasticTask + { + [Inject] + private ElasticBroadcastDriverWithFailures( + [Parameter(typeof(ElasticServiceConfigurationOptions.DefaultStageName))] string stageName, + [Parameter(typeof(ElasticServiceConfigurationOptions.NumEvaluators))] int numEvaluators, + IElasticContext context) : base(context) + { + IFailureStateMachine failureMachine = new DefaultFailureStateMachine(); + + failureMachine.SetThresholds( + DefaultFailureState.Threshold(DefaultFailureStates.ContinueAndReconfigure, 0.01F), + DefaultFailureState.Threshold(DefaultFailureStates.ContinueAndReschedule, 0.40F), + DefaultFailureState.Threshold(DefaultFailureStates.StopAndReschedule, 0.60F), + DefaultFailureState.Threshold(DefaultFailureStates.Fail, 0.80F)); + + IElasticStage stage = Context.CreateNewStage(stageName, numEvaluators, failureMachine); + + // Create and build the pipeline + stage.PipelineRoot + .Broadcast(TopologyType.Flat) + .Build(); + + // Build the stage + stage = stage.Build(); + + // Create the task manager, register the stage to the task manager, build the task set manager + TaskSetManager = Context + .CreateNewTaskSetManager(MasterTaskConfiguration, SlaveTaskConfiguration) + .AddStage(stage) + .Build(); + } + + private IConfiguration MasterTaskConfiguration(string taskId) + { + return Context.GetTaskConfigurationModule(taskId) + .Set(TaskConfiguration.Task, GenericType.Class) + .Build(); + } + + private IConfiguration SlaveTaskConfiguration(string taskId) + { + return Context.GetTaskConfigurationModule(taskId) + .Set(TaskConfiguration.Task, GenericType.Class) + .Build(); + } + } +} \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/GroupCommunication/ScatterReduceDriverAndTasks/ScatterReduceDriver.cs b/lang/cs/Org.Apache.REEF.Network.Examples/GroupCommunication/ScatterReduceDriverAndTasks/ScatterReduceDriver.cs index db82defd30..430e56503d 
100644 --- a/lang/cs/Org.Apache.REEF.Network.Examples/GroupCommunication/ScatterReduceDriverAndTasks/ScatterReduceDriver.cs +++ b/lang/cs/Org.Apache.REEF.Network.Examples/GroupCommunication/ScatterReduceDriverAndTasks/ScatterReduceDriver.cs @@ -37,10 +37,10 @@ namespace Org.Apache.REEF.Network.Examples.GroupCommunication.ScatterReduceDriverAndTasks { - public class ScatterReduceDriver : - IObserver, - IObserver, - IObserver, + public class ScatterReduceDriver : + IObserver, + IObserver, + IObserver, IObserver { private static readonly Logger LOGGER = Logger.GetLogger(typeof(ScatterReduceDriver)); @@ -79,12 +79,12 @@ public ScatterReduceDriver( .AddScatter( GroupTestConstants.ScatterOperatorName, GroupTestConstants.MasterTaskId, - TopologyTypes.Tree, + TopologyTypes.Tree, dataConverterConfig) .AddReduce( GroupTestConstants.ReduceOperatorName, GroupTestConstants.MasterTaskId, - TopologyTypes.Tree, + TopologyTypes.Tree, reduceFunctionConfig, dataConverterConfig) @@ -142,7 +142,7 @@ public void OnNext(IDriverStarted value) .SetMegabytes(512) .SetCores(2) .SetRackName("WonderlandRack") - .SetEvaluatorBatchId("BroadcastEvaluator").Build(); + .SetEvaluatorBatchId("BroadcastEvaluator").Build(); _evaluatorRequestor.Submit(request); } @@ -168,4 +168,4 @@ public int Reduce(IEnumerable elements) } } } -} +} \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/Org.Apache.REEF.Network.Examples.csproj b/lang/cs/Org.Apache.REEF.Network.Examples/Org.Apache.REEF.Network.Examples.csproj index 0855e6d076..30ad4f6842 100644 --- a/lang/cs/Org.Apache.REEF.Network.Examples/Org.Apache.REEF.Network.Examples.csproj +++ b/lang/cs/Org.Apache.REEF.Network.Examples/Org.Apache.REEF.Network.Examples.csproj @@ -24,6 +24,7 @@ under the License. + @@ -31,6 +32,6 @@ under the License. 
- + diff --git a/lang/cs/Org.Apache.REEF.Network.Tests/GroupCommunication/GroupCommunicationTests.cs b/lang/cs/Org.Apache.REEF.Network.Tests/GroupCommunication/GroupCommunicationTests.cs index 2f5fe8ac97..83eea0f594 100644 --- a/lang/cs/Org.Apache.REEF.Network.Tests/GroupCommunication/GroupCommunicationTests.cs +++ b/lang/cs/Org.Apache.REEF.Network.Tests/GroupCommunication/GroupCommunicationTests.cs @@ -69,9 +69,9 @@ public void TestSender() new BlockingCollection(); var handler1 = - Observer.Create>(msg => messages1.Add(msg.Data.First())); + Observer.Create>(msg => messages1.Add(msg.Data)); var handler2 = - Observer.Create>(msg => messages2.Add(msg.Data.First())); + Observer.Create>(msg => messages2.Add(msg.Data)); var networkServiceInjector1 = BuildNetworkServiceInjector(endpoint, handler1); var networkServiceInjector2 = BuildNetworkServiceInjector(endpoint, handler2); diff --git a/lang/cs/Org.Apache.REEF.Network.Tests/NetworkService/NetworkServiceTests.cs b/lang/cs/Org.Apache.REEF.Network.Tests/NetworkService/NetworkServiceTests.cs index d62e9aff65..2efded01b9 100644 --- a/lang/cs/Org.Apache.REEF.Network.Tests/NetworkService/NetworkServiceTests.cs +++ b/lang/cs/Org.Apache.REEF.Network.Tests/NetworkService/NetworkServiceTests.cs @@ -172,7 +172,7 @@ public MessageHandler(BlockingCollection queue) public void OnNext(NsMessage value) { - _queue.Add(value.Data.First()); + _queue.Add(value.Data); } public void OnError(Exception error) diff --git a/lang/cs/Org.Apache.REEF.Network.Tests/NetworkService/StreamingNetworkServiceTests.cs b/lang/cs/Org.Apache.REEF.Network.Tests/NetworkService/StreamingNetworkServiceTests.cs index 691ecbf864..8f59104363 100644 --- a/lang/cs/Org.Apache.REEF.Network.Tests/NetworkService/StreamingNetworkServiceTests.cs +++ b/lang/cs/Org.Apache.REEF.Network.Tests/NetworkService/StreamingNetworkServiceTests.cs @@ -319,7 +319,7 @@ private MessageHandler() public void OnNext(NsMessage value) { - _queue.Add(value.Data.First()); + 
_queue.Add(value.Data); } public void OnError(Exception error) diff --git a/lang/cs/Org.Apache.REEF.Network.Tests/Org.Apache.REEF.Network.Tests.csproj b/lang/cs/Org.Apache.REEF.Network.Tests/Org.Apache.REEF.Network.Tests.csproj index 2cd937f690..1352026b20 100644 --- a/lang/cs/Org.Apache.REEF.Network.Tests/Org.Apache.REEF.Network.Tests.csproj +++ b/lang/cs/Org.Apache.REEF.Network.Tests/Org.Apache.REEF.Network.Tests.csproj @@ -24,6 +24,7 @@ under the License. + diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Enum/TaskMessageType.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Enum/TaskMessageType.cs new file mode 100644 index 0000000000..a55f945383 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Enum/TaskMessageType.cs @@ -0,0 +1,34 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +using Org.Apache.REEF.Utilities.Attributes; + +namespace Org.Apache.REEF.Network.Elastic.Comm.Enum +{ + /// + /// Supported type of messages between task and driver. 
+ /// + [Unstable("0.16", "Types may change")] + internal enum TaskMessageType : ushort + { + JoinTopology = 0, + + TopologyUpdateRequest = 1, + + CompleteStage = 2 + } +} \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/IElasticDriverMessage.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/IElasticDriverMessage.cs new file mode 100644 index 0000000000..857b360de8 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/IElasticDriverMessage.cs @@ -0,0 +1,44 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +using Org.Apache.REEF.Utilities.Attributes; + +namespace Org.Apache.REEF.Network.Elastic.Comm +{ + /// + /// Message sent by the driver to operators on running tasks. + /// This message contains instructions from the driver to tasks's operators. + /// + [Unstable("0.16", "API may change")] + public interface IElasticDriverMessage + { + /// + /// The destination task of the message. + string Destination { get; } + + /// + /// Operator and event specific payload of the message. + /// + DriverMessagePayload Message { get; } + + /// + /// Utility method to serialize the message for communication over the network. 
+ /// + /// The serialized message + byte[] Serialize(); + } +} \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/ITaskMessageResponse.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/ITaskMessageResponse.cs new file mode 100644 index 0000000000..1f667a667a --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/ITaskMessageResponse.cs @@ -0,0 +1,38 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +using Org.Apache.REEF.Driver.Task; +using Org.Apache.REEF.Utilities.Attributes; +using System.Collections.Generic; + +namespace Org.Apache.REEF.Network.Elastic.Comm +{ + /// + /// Used to propagate task responses through operators and stages. + /// + [Unstable("0.16", "API may change")] + public interface ITaskMessageResponse + { + /// + /// Method triggered when a task to driver message is received. 
+ /// + /// The task message for the operator + /// If the message cannot be handled correctly or + /// generates an incorrect state + IEnumerable OnTaskMessage(ITaskMessage message); + } +} diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/ITypedMessage.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/ITypedMessage.cs new file mode 100644 index 0000000000..aaa374c40c --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/ITypedMessage.cs @@ -0,0 +1,38 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +namespace Org.Apache.REEF.Network.Elastic.Comm +{ + /// + /// Typed interface for data messages. + /// This is used to provide a unified interface over the + /// different types of data messages. + /// + /// The type of the data carried by the message + internal interface ITypedDataMessage + { + /// + /// The data contained in the message. + /// + T Data { get; } + + /// + /// The iteration number for the message. 
+        ///
+        int Iteration { get; }
+    }
+}
diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/DataMessage.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/DataMessage.cs
new file mode 100644
index 0000000000..fcde882313
--- /dev/null
+++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/DataMessage.cs
@@ -0,0 +1,86 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+using Org.Apache.REEF.Utilities.Attributes;
+
+namespace Org.Apache.REEF.Network.Elastic.Comm.Impl
+{
+    /// <summary>
+    /// Untyped data message sent by group communication operators. This is the class inherited by
+    /// GroupCommunicationMessage but seen by the Network Service.
+    /// DataMessages are untyped and used to simplify message propagation through the
+    /// communication layers that are type-agnostic.
+    /// </summary>
+    [Unstable("0.16", "API may change")]
+    internal abstract class DataMessage : ElasticGroupCommunicationMessage
+    {
+        /// <summary>
+        /// Constructor for an untyped data message.
+        /// </summary>
+        /// <param name="stageName">The name of the stage for the message</param>
+        /// <param name="operatorId">The operator sending the message</param>
+        /// <param name="iteration">The iteration in which the message is sent/valid</param>
+        // Protected: an abstract class can only be constructed through a subclass.
+        protected DataMessage(string stageName, int operatorId, int iteration)
+            : base(stageName, operatorId)
+        {
+            Iteration = iteration;
+        }
+
+        /// <summary>
+        /// The iteration number for the message.
+        /// </summary>
+        public int Iteration { get; set; }
+
+        /// <summary>
+        /// Clone the message.
+        /// </summary>
+        /// <returns>This same instance</returns>
+        public override object Clone()
+        {
+            // The assumption is that messages are immutable therefore there is no need to clone them
+            return this;
+        }
+    }
+
+    /// <summary>
+    /// A typed data message.
+    /// </summary>
+    /// <typeparam name="T">The type for the data message</typeparam>
+    [Unstable("0.16", "API may change")]
+    internal sealed class DataMessage<T> : DataMessage, ITypedDataMessage<T>
+    {
+        /// <summary>
+        /// Constructor of a typed data message.
+        /// </summary>
+        /// <param name="stageName">The name of the stage for the message</param>
+        /// <param name="operatorId">The operator sending the message</param>
+        /// <param name="iteration">The iteration in which the message is sent/valid</param>
+        /// <param name="data">The data contained in the message</param>
+        public DataMessage(
+            string stageName,
+            int operatorId,
+            int iteration, //// For the moment we consider iterations as ints. Maybe this would change in the future
+            T data) : base(stageName, operatorId, iteration)
+        {
+            Data = data;
+        }
+
+        /// <summary>
+        /// The data contained in the message.
+        /// </summary>
+        public T Data { get; set; }
+    }
+}
\ No newline at end of file
diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/DataMessageStreamingCodec.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/DataMessageStreamingCodec.cs
new file mode 100644
index 0000000000..dab643ac0d
--- /dev/null
+++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/DataMessageStreamingCodec.cs
@@ -0,0 +1,142 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.
The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+using System;
+using System.Threading;
+using System.Threading.Tasks;
+using Org.Apache.REEF.Tang.Annotations;
+using Org.Apache.REEF.Wake.Remote;
+using Org.Apache.REEF.Wake.StreamingCodec;
+using Org.Apache.REEF.Utilities;
+
+namespace Org.Apache.REEF.Network.Elastic.Comm.Impl
+{
+    /// <summary>
+    /// Streaming codec for the data message.
+    /// Wire layout: [int stage-name length][stage-name bytes][int operator id][int iteration],
+    /// followed by the data as written by the injected data codec.
+    /// </summary>
+    /// <typeparam name="T">The type of the data carried by the message</typeparam>
+    internal sealed class DataMessageStreamingCodec<T> : IStreamingCodec<DataMessage<T>>
+    {
+        // Operator id and iteration, both ints, follow the stage name in the metadata.
+        private const int TrailerSize = sizeof(int) + sizeof(int);
+
+        private readonly IStreamingCodec<T> _codec;
+
+        /// <summary>
+        /// Injectable constructor.
+        /// </summary>
+        /// <param name="codec">The codec used to read and write the message data</param>
+        [Inject]
+        private DataMessageStreamingCodec(IStreamingCodec<T> codec)
+        {
+            _codec = codec;
+        }
+
+        /// <summary>
+        /// Read the class fields.
+        /// </summary>
+        /// <param name="reader">The reader from which to read</param>
+        /// <returns>The data message</returns>
+        public DataMessage<T> Read(IDataReader reader)
+        {
+            int stageLength = reader.ReadInt32();
+            int metadataSize = stageLength + TrailerSize;
+            byte[] metadata = new byte[metadataSize];
+            reader.Read(ref metadata, 0, metadataSize);
+            var (stageName, operatorId, iteration) = GenerateMetaDataDecoding(metadata, stageLength);
+            var data = _codec.Read(reader);
+
+            return new DataMessage<T>(stageName, operatorId, iteration, data);
+        }
+
+        /// <summary>
+        /// Writes the class fields.
+        /// </summary>
+        /// <param name="obj">The message to write</param>
+        /// <param name="writer">The writer to which to write</param>
+        public void Write(DataMessage<T> obj, IDataWriter writer)
+        {
+            byte[] encodedMetadata = GenerateMetaDataEncoding(obj);
+
+            writer.Write(encodedMetadata, 0, encodedMetadata.Length);
+
+            _codec.Write(obj.Data, writer);
+        }
+
+        /// <summary>
+        /// Read the class fields.
+        /// </summary>
+        /// <param name="reader">The reader from which to read</param>
+        /// <param name="token">The cancellation token</param>
+        /// <returns>The data message</returns>
+        public async Task<DataMessage<T>> ReadAsync(IDataReader reader,
+            CancellationToken token)
+        {
+            // Await the length prefix too, so the async path never blocks on the reader.
+            int stageLength = await reader.ReadInt32Async(token);
+            int metadataSize = stageLength + TrailerSize;
+            byte[] metadata = new byte[metadataSize];
+            await reader.ReadAsync(metadata, 0, metadataSize, token);
+            var (stageName, operatorId, iteration) = GenerateMetaDataDecoding(metadata, stageLength);
+            var data = await _codec.ReadAsync(reader, token);
+
+            return new DataMessage<T>(stageName, operatorId, iteration, data);
+        }
+
+        /// <summary>
+        /// Writes the class fields.
+        /// </summary>
+        /// <param name="obj">The message to write</param>
+        /// <param name="writer">The writer to which to write</param>
+        /// <param name="token">The cancellation token</param>
+        public async System.Threading.Tasks.Task WriteAsync(DataMessage<T> obj, IDataWriter writer, CancellationToken token)
+        {
+            byte[] encodedMetadata = GenerateMetaDataEncoding(obj);
+
+            await writer.WriteAsync(encodedMetadata, 0, encodedMetadata.Length, token);
+
+            await _codec.WriteAsync(obj.Data, writer, token);
+        }
+
+        /// <summary>
+        /// Encodes stage-name length, stage name, operator id and iteration into one buffer.
+        /// </summary>
+        private static byte[] GenerateMetaDataEncoding(DataMessage<T> obj)
+        {
+            byte[] stageBytes = ByteUtilities.StringToByteArrays(obj.StageName);
+            var length = stageBytes.Length;
+            byte[] metadataBytes = new byte[sizeof(int) + length + TrailerSize];
+            int offset = 0;
+
+            Buffer.BlockCopy(BitConverter.GetBytes(length), 0, metadataBytes, offset, sizeof(int));
+            offset += sizeof(int);
+
+            Buffer.BlockCopy(stageBytes, 0, metadataBytes, offset, length);
+            offset += length;
+
+            Buffer.BlockCopy(BitConverter.GetBytes(obj.OperatorId), 0, metadataBytes, offset, sizeof(int));
+            offset += sizeof(int);
+
+            Buffer.BlockCopy(BitConverter.GetBytes(obj.Iteration), 0, metadataBytes, offset, sizeof(int));
+
+            return metadataBytes;
+        }
+
+        /// <summary>
+        /// Decodes a buffer holding the stage name followed by operator id and iteration
+        /// (the stage-name length prefix has already been consumed by the caller).
+        /// </summary>
+        private static (string stageName, int operatorId, int iteration) GenerateMetaDataDecoding(byte[] obj, int stageLength)
+        {
+            int offset = 0;
+            string stageName = ByteUtilities.ByteArraysToString(obj, offset, stageLength);
+            offset += stageLength;
+
+            int operatorId = BitConverter.ToInt32(obj, offset);
+            offset += sizeof(int);
+
+            int iteration = BitConverter.ToInt32(obj, offset);
+
+            return (stageName, operatorId, iteration);
+        }
+    }
+}
\ No newline at end of file
diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/DataMessageWithTopology.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/DataMessageWithTopology.cs
new file mode 100644
index 0000000000..95c13836a6
--- /dev/null
+++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/DataMessageWithTopology.cs
@@ -0,0 +1,94 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+using Org.Apache.REEF.Utilities.Attributes;
+using System.Collections.Generic;
+
+namespace Org.Apache.REEF.Network.Elastic.Comm.Impl
+{
+    ///
+    /// In this message data and topology update information are sent together.
+    /// This message is untyped and used to simplify message propagation through the
+    /// communication layers that are type-agnostic.
+    ///
+    [Unstable("0.16", "API may change")]
+    internal abstract class DataMessageWithTopology : DataMessage
+    {
+        /// <summary>
+        /// Constructor for the base untyped data message with topology.
+        /// </summary>
+        /// <param name="stageName">The name of the stage for the message</param>
+        /// <param name="operatorId">The operator sending the message</param>
+        /// <param name="iteration">The iteration in which the message is sent/valid</param>
+        // Protected: an abstract class can only be constructed through a subclass.
+        protected DataMessageWithTopology(string stageName, int operatorId, int iteration)
+            : base(stageName, operatorId, iteration)
+        {
+        }
+
+        /// <summary>
+        /// Some topology updates piggybacked to the main data message.
+        /// </summary>
+        // NOTE(review): the List element type is not visible in this extract - confirm against the original file.
+        internal List TopologyUpdates { get; set; }
+    }
+
+    /// <summary>
+    /// Typed version for DataMessageWithTopology. This class is used at the communication entry-points.
+    /// </summary>
+    /// <typeparam name="T">The type of the data carried by the message</typeparam>
+    [Unstable("0.16", "API may change")]
+    internal class DataMessageWithTopology<T> : DataMessageWithTopology, ITypedDataMessage<T>
+    {
+        /// <summary>
+        /// Main constructor for data messages with topology information.
+        /// </summary>
+        /// <param name="stageName">The name of the stage for the message</param>
+        /// <param name="operatorId">The operator sending the message</param>
+        /// <param name="iteration">The iteration in which the message is sent/valid</param>
+        /// <param name="data">The data contained in the message</param>
+        /// <param name="updates">The topology updates being transmitted with the data</param>
+        public DataMessageWithTopology(
+            string stageName,
+            int operatorId,
+            int iteration,
+            T data,
+            List updates) : base(stageName, operatorId, iteration)
+        {
+            Data = data;
+            TopologyUpdates = updates;
+        }
+
+        /// <summary>
+        /// Constructor for a data message with topology but without topology updates.
+        /// An empty update list is used.
+        /// </summary>
+        /// <param name="stageName">The name of the stage for the message</param>
+        /// <param name="operatorId">The operator sending the message</param>
+        /// <param name="iteration">The iteration in which the message is sent/valid</param>
+        /// <param name="data">The data contained in the message</param>
+        public DataMessageWithTopology(
+            string stageName,
+            int operatorId,
+            int iteration,
+            T data) : this(stageName, operatorId, iteration, data, new List())
+        {
+        }
+
+        /// <summary>
+        /// The data contained in the message.
+        /// </summary>
+        public T Data { get; set; }
+    }
+}
\ No newline at end of file
diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/DataMessageWithTopologyStreamingCodec.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/DataMessageWithTopologyStreamingCodec.cs
new file mode 100644
index 0000000000..2c27e44462
--- /dev/null
+++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/DataMessageWithTopologyStreamingCodec.cs
@@ -0,0 +1,160 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
using System;
using System.Collections.Generic;
using System.Linq;
using System.Threading;
using System.Threading.Tasks;
using Org.Apache.REEF.Tang.Annotations;
using Org.Apache.REEF.Wake.Remote;
using Org.Apache.REEF.Wake.StreamingCodec;
using Org.Apache.REEF.Utilities;

namespace Org.Apache.REEF.Network.Elastic.Comm.Impl
{
    /// <summary>
    /// Streaming codec for <see cref="DataMessageWithTopology{T}"/>.
    /// On the wire a message is: an int32 metadata length, the metadata block, then the
    /// payload encoded by the injected codec for <typeparamref name="T"/>.
    /// The metadata block is: int32 stage-name length, stage-name bytes, int32 operator id,
    /// int32 iteration, int32 total size of the updates, serialized topology updates.
    /// </summary>
    /// <typeparam name="T">The type of the data carried by the message</typeparam>
    internal sealed class DataMessageWithTopologyStreamingCodec<T> : IStreamingCodec<DataMessageWithTopology<T>>
    {
        private readonly IStreamingCodec<T> _codec;

        /// <summary>
        /// Injectable constructor.
        /// </summary>
        /// <param name="codec">The codec used to read and write the message data</param>
        [Inject]
        private DataMessageWithTopologyStreamingCodec(IStreamingCodec<T> codec)
        {
            _codec = codec;
        }

        /// <summary>
        /// Reads a message written by <see cref="Write"/>.
        /// </summary>
        /// <param name="reader">The reader from which to read</param>
        /// <returns>The decoded message, including its topology updates</returns>
        public DataMessageWithTopology<T> Read(IDataReader reader)
        {
            // The prefix is the exact size of the metadata block, as in ReadAsync.
            int metadataSize = reader.ReadInt32();
            byte[] metadata = new byte[metadataSize];
            reader.Read(ref metadata, 0, metadataSize);
            var (stageName, operatorId, iteration, updates) = MetaDataDecoding(metadata);
            T data = _codec.Read(reader);

            // Forward the decoded updates; dropping them would silently lose topology changes.
            return new DataMessageWithTopology<T>(stageName, operatorId, iteration, data, updates);
        }

        /// <summary>
        /// Writes the message: metadata length, metadata, then the data.
        /// </summary>
        /// <param name="obj">The message to write</param>
        /// <param name="writer">The writer to which to write</param>
        public void Write(DataMessageWithTopology<T> obj, IDataWriter writer)
        {
            byte[] encodedMetadata = MetaDataEncoding(obj);

            // Same framing as WriteAsync, so that Read/ReadAsync can size the metadata block.
            writer.Write(BitConverter.GetBytes(encodedMetadata.Length), 0, sizeof(int));
            writer.Write(encodedMetadata, 0, encodedMetadata.Length);

            _codec.Write(obj.Data, writer);
        }

        /// <summary>
        /// Asynchronously reads a message written by <see cref="WriteAsync"/>.
        /// </summary>
        /// <param name="reader">The reader from which to read</param>
        /// <param name="token">The cancellation token</param>
        /// <returns>The decoded message, including its topology updates</returns>
        public async Task<DataMessageWithTopology<T>> ReadAsync(IDataReader reader,
            CancellationToken token)
        {
            int metadataSize = await reader.ReadInt32Async(token);
            byte[] metadata = new byte[metadataSize];
            await reader.ReadAsync(metadata, 0, metadataSize, token);
            var (stageName, operatorId, iteration, updates) = MetaDataDecoding(metadata);
            T data = await _codec.ReadAsync(reader, token);

            return new DataMessageWithTopology<T>(stageName, operatorId, iteration, data, updates);
        }

        /// <summary>
        /// Asynchronously writes the message: metadata length, metadata, then the data.
        /// </summary>
        /// <param name="obj">The message to write</param>
        /// <param name="writer">The writer to which to write</param>
        /// <param name="token">The cancellation token</param>
        // The return type stays fully qualified: presumably the bare name Task would bind to
        // the sibling namespace Org.Apache.REEF.Network.Elastic.Task -- confirm before shortening.
        public async System.Threading.Tasks.Task WriteAsync(DataMessageWithTopology<T> obj, IDataWriter writer, CancellationToken token)
        {
            byte[] encodedMetadata = MetaDataEncoding(obj);

            await writer.WriteAsync(BitConverter.GetBytes(encodedMetadata.Length), 0, sizeof(int), token);
            await writer.WriteAsync(encodedMetadata, 0, encodedMetadata.Length, token);

            await _codec.WriteAsync(obj.Data, writer, token);
        }

        /// <summary>
        /// Encodes stage name, operator id, iteration and topology updates into one buffer.
        /// </summary>
        private static byte[] MetaDataEncoding(DataMessageWithTopology<T> obj)
        {
            byte[] stageBytes = ByteUtilities.StringToByteArrays(obj.StageName);

            // A message built without updates must still be encodable.
            List<TopologyUpdate> updates = obj.TopologyUpdates ?? new List<TopologyUpdate>();
            int totalLengthUpdates = updates.Sum(x => x.Size);

            // Four int32 fields + stage name + updates. No other field is written, so no
            // extra byte is reserved.
            byte[] buffer = new byte[(4 * sizeof(int)) + stageBytes.Length + totalLengthUpdates];
            int offset = 0;

            Buffer.BlockCopy(BitConverter.GetBytes(stageBytes.Length), 0, buffer, offset, sizeof(int));
            offset += sizeof(int);

            Buffer.BlockCopy(stageBytes, 0, buffer, offset, stageBytes.Length);
            offset += stageBytes.Length;

            Buffer.BlockCopy(BitConverter.GetBytes(obj.OperatorId), 0, buffer, offset, sizeof(int));
            offset += sizeof(int);

            Buffer.BlockCopy(BitConverter.GetBytes(obj.Iteration), 0, buffer, offset, sizeof(int));
            offset += sizeof(int);

            Buffer.BlockCopy(BitConverter.GetBytes(totalLengthUpdates), 0, buffer, offset, sizeof(int));
            offset += sizeof(int);

            TopologyUpdate.Serialize(buffer, ref offset, updates);

            return buffer;
        }

        /// <summary>
        /// Decodes a buffer produced by <see cref="MetaDataEncoding"/>.
        /// </summary>
        private static (string stageName, int operatorId, int iteration, List<TopologyUpdate> updates) MetaDataDecoding(byte[] obj)
        {
            int offset = 0;
            int stageLength = BitConverter.ToInt32(obj, offset);
            offset += sizeof(int);

            string stageName = ByteUtilities.ByteArraysToString(obj, offset, stageLength);
            offset += stageLength;

            int operatorId = BitConverter.ToInt32(obj, offset);
            offset += sizeof(int);

            int iteration = BitConverter.ToInt32(obj, offset);
            offset += sizeof(int);

            int length = BitConverter.ToInt32(obj, offset);
            offset += sizeof(int);

            var updates = TopologyUpdate.Deserialize(obj, length, offset);

            return (stageName, operatorId, iteration, updates);
        }
    }
}
\ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/DriverMessagePayload.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/DriverMessagePayload.cs new file mode 100644 index 0000000000..553876cd05 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/DriverMessagePayload.cs @@ -0,0 +1,72 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied.
See the License for the +// specific language governing permissions and limitations +// under the License.

using Org.Apache.REEF.Network.Elastic.Comm.Impl;
using Org.Apache.REEF.Utilities.Attributes;

namespace Org.Apache.REEF.Network.Elastic.Comm
{
    /// <summary>
    /// Base class for the payload of messages going from the driver to tasks.
    /// </summary>
    [Unstable("0.16", "API may change")]
    public abstract class DriverMessagePayload : ElasticGroupCommunicationMessage
    {
        /// <summary>
        /// The type of payload.
        /// </summary>
        internal DriverMessagePayloadType PayloadType { get; set; }

        /// <summary>
        /// The iteration number in which the message is sent.
        /// </summary>
        internal int Iteration { get; private set; }

        /// <summary>
        /// Construct a payload for messages created at the driver and directed to tasks.
        /// </summary>
        /// <param name="stageName">The name of the stage</param>
        /// <param name="operatorId">The id of the operator within the stage</param>
        /// <param name="iteration">The iteration number in which the message is sent</param>
        public DriverMessagePayload(string stageName, int operatorId, int iteration)
            : base(stageName, operatorId)
        {
            Iteration = iteration;
        }

        /// <summary>
        /// Serializes the payload for communication.
        /// </summary>
        /// <returns>The serialized payload</returns>
        internal abstract byte[] Serialize();
    }

    /// <summary>
    /// Possible types of driver message payloads. The underlying type is
    /// <see cref="ushort"/>, i.e. two bytes on the wire.
    /// </summary>
    [Unstable("0.16", "Types may change")]
    internal enum DriverMessagePayloadType : ushort
    {
        Ring = 1,

        Resume = 2,

        // Topology update message.
        Update = 3,

        // Topology message sent for a failure; decoded through the same
        // TopologyMessagePayload path as Update.
        Failure = 4
    }
}
\ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/ElasticDriverMessageImpl.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/ElasticDriverMessageImpl.cs new file mode 100644 index 0000000000..899b2aac9c --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/ElasticDriverMessageImpl.cs @@ -0,0 +1,104 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements.
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License.

using Org.Apache.REEF.Tang.Exceptions;
using Org.Apache.REEF.Utilities;
using Org.Apache.REEF.Utilities.Attributes;
using System;
using System.Collections.Generic;
using System.Linq;

namespace Org.Apache.REEF.Network.Elastic.Comm.Impl
{
    /// <summary>
    /// Message sent by the driver to operators on running tasks.
    /// This message contains instructions from the driver to the tasks' operators.
    /// Serialized layout: int32 destination length, destination bytes,
    /// 2-byte payload type, serialized payload.
    /// </summary>
    [Unstable("0.16", "API may change")]
    internal sealed class ElasticDriverMessageImpl : IElasticDriverMessage
    {
        /// <summary>
        /// Create a new driver message.
        /// </summary>
        /// <param name="destinationTaskId">The message destination task</param>
        /// <param name="message">The message</param>
        public ElasticDriverMessageImpl(
            string destinationTaskId,
            DriverMessagePayload message)
        {
            Destination = destinationTaskId;
            Message = message;
        }

        /// <summary>
        /// The destination task of the message.
        /// </summary>
        public string Destination { get; }

        /// <summary>
        /// Operator and event specific payload of the message.
        /// </summary>
        public DriverMessagePayload Message { get; }

        /// <summary>
        /// Serializes the message for communication over the network.
        /// </summary>
        /// <returns>The serialized message</returns>
        public byte[] Serialize()
        {
            List<byte> buffer = new List<byte>();

            var destinationBytes = ByteUtilities.StringToByteArrays(Destination);
            buffer.AddRange(BitConverter.GetBytes(destinationBytes.Length));
            buffer.AddRange(destinationBytes);

            // The enum is backed by ushort and From() reads it with ToUInt16, so write it
            // with the same signedness (identical bytes for all currently defined values).
            buffer.AddRange(BitConverter.GetBytes((ushort)Message.PayloadType));
            buffer.AddRange(Message.Serialize());

            return buffer.ToArray();
        }

        /// <summary>
        /// Creates a driver message out of the memory buffer.
        /// </summary>
        /// <param name="data">The buffer containing a serialized driver message</param>
        /// <param name="offset">The offset where to start the deserialization process</param>
        /// <returns>The deserialized driver message</returns>
        /// <exception cref="IllegalStateException">If the payload type is not a topology message type</exception>
        public static ElasticDriverMessageImpl From(byte[] data, int offset = 0)
        {
            int destinationLength = BitConverter.ToInt32(data, offset);

            // Advance relative to the caller-supplied offset; an absolute "4" is only
            // correct when the message starts at index 0.
            offset += sizeof(int);
            string destination = ByteUtilities.ByteArraysToString(data, offset, destinationLength);
            offset += destinationLength;

            DriverMessagePayloadType type = (DriverMessagePayloadType)BitConverter.ToUInt16(data, offset);
            offset += sizeof(ushort);

            DriverMessagePayload payload;

            switch (type)
            {
                case DriverMessagePayloadType.Update:
                case DriverMessagePayloadType.Failure:
                    payload = TopologyMessagePayload.From(type, data, offset);
                    break;
                default:
                    throw new IllegalStateException("Message type not recognized");
            }

            return new ElasticDriverMessageImpl(destination, payload);
        }
    }
}
\ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/ElasticGroupCommunicationMessage.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/ElasticGroupCommunicationMessage.cs new file mode 100644 index 0000000000..700fcf591d --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/ElasticGroupCommunicationMessage.cs @@ -0,0 +1,57 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements.
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License.

using Org.Apache.REEF.Network.Elastic.Task;
using Org.Apache.REEF.Utilities.Attributes;
using System;

namespace Org.Apache.REEF.Network.Elastic.Comm.Impl
{
    /// <summary>
    /// Base class for the messages exchanged by Group Communication operators.
    /// Every message is identified by the stage and the operator it belongs to.
    /// </summary>
    [Unstable("0.16", "API may change")]
    public abstract class ElasticGroupCommunicationMessage : ICloneable, INodeIdentifier
    {
        /// <summary>
        /// The name of the stage.
        /// </summary>
        public string StageName { get; }

        /// <summary>
        /// The id of the operator.
        /// </summary>
        public int OperatorId { get; }

        /// <summary>
        /// Create a new elastic group communication message.
        /// </summary>
        /// <param name="stageName">The name of the stage</param>
        /// <param name="operatorId">The id of the operator sending the message</param>
        protected ElasticGroupCommunicationMessage(
            string stageName,
            int operatorId)
        {
            StageName = stageName;
            OperatorId = operatorId;
        }

        /// <summary>
        /// Clone the message.
        /// </summary>
        /// <returns>A copy of the message, as defined by the concrete message type</returns>
        public abstract object Clone();
    }
}
\ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/FailureMessagePayload.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/FailureMessagePayload.cs new file mode 100644 index 0000000000..24b92c69f3 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/FailureMessagePayload.cs @@ -0,0 +1,43 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +using Org.Apache.REEF.Utilities.Attributes; +using System.Collections.Generic; + +namespace Org.Apache.REEF.Network.Elastic.Comm.Impl +{ + /// + /// Messages sent by the driver to operators. + /// This message contains information for the destination node on the topology.
///
[Unstable("0.16", "API may change")]
internal sealed class FailureMessagePayload : TopologyMessagePayload
{
    /// <summary>
    /// Create a driver message payload of type
    /// <see cref="DriverMessagePayloadType.Failure"/> containing topology updates.
    /// </summary>
    /// <param name="updates">The topology updates</param>
    /// <param name="stageName">The stage context for the message</param>
    /// <param name="operatorId">The id of the operator receiving the topology update</param>
    /// <param name="iteration">The iteration in which the update takes effect</param>
    public FailureMessagePayload(
        IEnumerable<TopologyUpdate> updates,
        string stageName,
        int operatorId,
        int iteration)
        : base(DriverMessagePayloadType.Failure, updates, stageName, operatorId, iteration)
    {
    }
}
}
\ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/TopologyMessagePayload.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/TopologyMessagePayload.cs new file mode 100644 index 0000000000..d70151ea4d --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/TopologyMessagePayload.cs @@ -0,0 +1,144 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License.
using Org.Apache.REEF.Tang.Exceptions;
using Org.Apache.REEF.Utilities;
using Org.Apache.REEF.Utilities.Attributes;
using System;
using System.Collections.Generic;
using System.Linq;

namespace Org.Apache.REEF.Network.Elastic.Comm.Impl
{
    /// <summary>
    /// Messages sent by the driver to operators.
    /// This message contains information for the destination node on the topology.
    /// Serialized layout: int32 total size of the updates, serialized updates,
    /// int32 stage-name length, stage-name bytes, int32 operator id, int32 iteration.
    /// </summary>
    [Unstable("0.16", "API may change")]
    internal class TopologyMessagePayload : DriverMessagePayload
    {
        /// <summary>
        /// Create a driver message payload containing topology updates.
        /// </summary>
        /// <param name="type">The payload type (update or failure)</param>
        /// <param name="updates">The topology updates; copied into a new list</param>
        /// <param name="stageName">The stage context for the message</param>
        /// <param name="operatorId">The id of the operator receiving the topology update</param>
        /// <param name="iteration">The iteration in which the update takes effect</param>
        public TopologyMessagePayload(
            DriverMessagePayloadType type,
            IEnumerable<TopologyUpdate> updates,
            string stageName,
            int operatorId,
            int iteration)
            : base(stageName, operatorId, iteration)
        {
            PayloadType = type;
            TopologyUpdates = updates.ToList();
        }

        /// <summary>
        /// Clone the message.
        /// </summary>
        /// <returns>
        /// A new payload of the same type in which every topology update has been cloned as well.
        /// </returns>
        public override object Clone()
        {
            var updatesClone = TopologyUpdates.Select(up => (TopologyUpdate)up.Clone()).ToList();

            return TopologyMessageBuilder(PayloadType, updatesClone, StageName, OperatorId, Iteration);
        }

        /// <summary>
        /// The updates for the topology.
        /// </summary>
        internal List<TopologyUpdate> TopologyUpdates { get; }

        /// <summary>
        /// Creates a topology message payload out of a memory buffer.
        /// </summary>
        /// <param name="type">The payload type to instantiate</param>
        /// <param name="data">The buffer containing a serialized message payload</param>
        /// <param name="offset">The offset where to start the deserialization process</param>
        /// <returns>A topology message payload</returns>
        /// <exception cref="IllegalStateException">If the type is not a topology message type</exception>
        internal static DriverMessagePayload From(DriverMessagePayloadType type, byte[] data, int offset = 0)
        {
            int length = BitConverter.ToInt32(data, offset);
            offset += sizeof(int);
            List<TopologyUpdate> updates = TopologyUpdate.Deserialize(data, length, offset);
            offset += length;

            length = BitConverter.ToInt32(data, offset);
            offset += sizeof(int);
            string stage = ByteUtilities.ByteArraysToString(data, offset, length);
            offset += length;
            int operatorId = BitConverter.ToInt32(data, offset);
            offset += sizeof(int);
            int iteration = BitConverter.ToInt32(data, offset);

            return TopologyMessageBuilder(type, updates, stage, operatorId, iteration);
        }

        /// <summary>
        /// Serializes the payload for communication.
        /// </summary>
        /// <returns>The serialized payload</returns>
        internal override byte[] Serialize()
        {
            byte[] stageBytes = ByteUtilities.StringToByteArrays(StageName);
            int totalLengthUpdates = TopologyUpdates.Sum(x => x.Size);

            // Four int32 fields (updates size, stage length, operator id, iteration) plus the
            // two variable-length parts. Nothing else is written, so no extra byte is reserved.
            byte[] buffer = new byte[(4 * sizeof(int)) + totalLengthUpdates + stageBytes.Length];
            int offset = 0;

            Buffer.BlockCopy(BitConverter.GetBytes(totalLengthUpdates), 0, buffer, offset, sizeof(int));
            offset += sizeof(int);

            TopologyUpdate.Serialize(buffer, ref offset, TopologyUpdates);

            Buffer.BlockCopy(BitConverter.GetBytes(stageBytes.Length), 0, buffer, offset, sizeof(int));
            offset += sizeof(int);
            Buffer.BlockCopy(stageBytes, 0, buffer, offset, stageBytes.Length);
            offset += stageBytes.Length;
            Buffer.BlockCopy(BitConverter.GetBytes(OperatorId), 0, buffer, offset, sizeof(int));
            offset += sizeof(int);
            Buffer.BlockCopy(BitConverter.GetBytes(Iteration), 0, buffer, offset, sizeof(int));

            return buffer;
        }

        /// <summary>
        /// Instantiates the concrete payload class matching the payload type.
        /// </summary>
        private static DriverMessagePayload TopologyMessageBuilder(
            DriverMessagePayloadType type,
            List<TopologyUpdate> updates,
            string stageName,
            int operatorId,
            int iteration)
        {
            switch (type)
            {
                case DriverMessagePayloadType.Update:
                    return new UpdateMessagePayload(updates, stageName, operatorId, iteration);
                case DriverMessagePayloadType.Failure:
                    return new FailureMessagePayload(updates, stageName, operatorId, iteration);
                default:
                    throw new IllegalStateException($"Topology message type {type} not found.");
            }
        }
    }
}
\ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/TopologyUpdate.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/TopologyUpdate.cs new file mode 100644 index 0000000000..8fcc02c025 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/TopologyUpdate.cs @@ -0,0 +1,201 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +using Org.Apache.REEF.Utilities; +using Org.Apache.REEF.Utilities.Attributes; +using System; +using System.Collections.Generic; +using System.Linq; + +namespace Org.Apache.REEF.Network.Elastic.Comm.Impl +{ + /// + /// Class defining the updates of the topology for a running task.
///
[Unstable("0.16", "API may change")]
internal sealed class TopologyUpdate : ICloneable
{
    /// <summary>
    /// Create an update for a node containing both the list of children and the root node.
    /// </summary>
    /// <param name="node">The node receiving the update</param>
    /// <param name="children">The update to the children of the node; copied into a new list</param>
    /// <param name="root">The update for the root of the node</param>
    public TopologyUpdate(string node, IEnumerable<string> children, string root)
    {
        Node = node;
        Children = children.ToList();
        Root = root;
    }

    /// <summary>
    /// Create an update for a node containing only the list of children (the root is empty).
    /// </summary>
    /// <param name="node">The node receiving the update</param>
    /// <param name="children">The update to the children of the node</param>
    public TopologyUpdate(string node, IEnumerable<string> children) : this(node, children, string.Empty)
    {
    }

    /// <summary>
    /// Create an update for a node containing only the root node (no children).
    /// </summary>
    /// <param name="node">The node receiving the update</param>
    /// <param name="root">The update for the root of the node</param>
    public TopologyUpdate(string node, string root) : this(node, new List<string>(), root)
    {
    }

    /// <summary>
    /// The node receiving the update.
    /// </summary>
    public string Node { get; }

    /// <summary>
    /// The updates for the children.
    /// </summary>
    public List<string> Children { get; set; }

    /// <summary>
    /// The updates for the root.
    /// </summary>
    public string Root { get; }

    /// <summary>
    /// The number of bytes <see cref="Serialize"/> writes for this update:
    /// a length-prefixed node, an int32 child count, one length-prefixed string per child,
    /// and a length-prefixed root (length 0 when the root is null or empty).
    /// </summary>
    public int Size
    {
        get
        {
            // Sizes are measured on the encoded bytes, not on string.Length: the two differ
            // whenever the encoding uses more than one byte for a character.
            int nodeSize = sizeof(int) + EncodedLength(Node);
            int childrenSize = sizeof(int) + Children.Sum(child => sizeof(int) + EncodedLength(child));
            int rootSize = sizeof(int) + EncodedLength(Root);

            return nodeSize + childrenSize + rootSize;
        }
    }

    /// <summary>
    /// Serialize the updates one after the other into the buffer.
    /// </summary>
    /// <param name="buffer">The memory space where to copy the serialized updates</param>
    /// <param name="offset">Where to start writing in the buffer; advanced past the written bytes</param>
    /// <param name="updates">The updates to serialize</param>
    internal static void Serialize(byte[] buffer, ref int offset, IEnumerable<TopologyUpdate> updates)
    {
        foreach (var update in updates)
        {
            WriteString(buffer, ref offset, update.Node);

            Buffer.BlockCopy(BitConverter.GetBytes(update.Children.Count), 0, buffer, offset, sizeof(int));
            offset += sizeof(int);
            foreach (var child in update.Children)
            {
                WriteString(buffer, ref offset, child);
            }

            WriteString(buffer, ref offset, update.Root);
        }
    }

    /// <summary>
    /// Deserialize the updates.
    /// </summary>
    /// <param name="data">The memory space where to fetch the serialized updates</param>
    /// <param name="totLength">The total memory size of the serialized updates</param>
    /// <param name="start">Where to start reading in the buffer</param>
    /// <returns>The updates found in the given region, in serialization order</returns>
    internal static List<TopologyUpdate> Deserialize(byte[] data, int totLength, int start)
    {
        var result = new List<TopologyUpdate>();
        int offset = 0;

        while (offset < totLength)
        {
            string node = ReadString(data, start, ref offset);

            int count = BitConverter.ToInt32(data, start + offset);
            offset += sizeof(int);
            var children = new List<string>();
            for (int i = 0; i < count; i++)
            {
                children.Add(ReadString(data, start, ref offset));
            }

            int rootLength = BitConverter.ToInt32(data, start + offset);
            offset += sizeof(int);
            if (rootLength > 0)
            {
                string root = ByteUtilities.ByteArraysToString(data, start + offset, rootLength);
                offset += rootLength;
                result.Add(new TopologyUpdate(node, children, root));
            }
            else
            {
                // A zero-length root means "no root update".
                result.Add(new TopologyUpdate(node, children));
            }
        }

        return result;
    }

    /// <summary>
    /// Clone the update. The children list is copied by the constructor.
    /// </summary>
    public object Clone()
    {
        return new TopologyUpdate(Node, Children, Root);
    }

    // Number of bytes the string occupies once encoded; null counts as empty.
    private static int EncodedLength(string value)
    {
        return string.IsNullOrEmpty(value) ? 0 : ByteUtilities.StringToByteArrays(value).Length;
    }

    // Writes an int32 byte-length prefix followed by the encoded string (nothing for null/empty).
    private static void WriteString(byte[] buffer, ref int offset, string value)
    {
        byte[] bytes = string.IsNullOrEmpty(value) ? new byte[0] : ByteUtilities.StringToByteArrays(value);

        Buffer.BlockCopy(BitConverter.GetBytes(bytes.Length), 0, buffer, offset, sizeof(int));
        offset += sizeof(int);
        Buffer.BlockCopy(bytes, 0, buffer, offset, bytes.Length);
        offset += bytes.Length;
    }

    // Reads an int32 byte-length prefix followed by that many bytes decoded as a string.
    private static string ReadString(byte[] data, int start, ref int offset)
    {
        int length = BitConverter.ToInt32(data, start + offset);
        offset += sizeof(int);
        string value = ByteUtilities.ByteArraysToString(data, start + offset, length);
        offset += length;

        return value;
    }
}
}
diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/UpdateMessagePayload.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/UpdateMessagePayload.cs new file mode 100644 index 0000000000..adc295df62 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/UpdateMessagePayload.cs @@ -0,0 +1,42 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership.
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License.

using Org.Apache.REEF.Utilities.Attributes;
using System.Collections.Generic;

namespace Org.Apache.REEF.Network.Elastic.Comm.Impl
{
    /// <summary>
    /// Topology payload of type <see cref="DriverMessagePayloadType.Update"/>, sent by the
    /// driver to operators. It carries topology information for the destination node.
    /// </summary>
    [Unstable("0.16", "API may change")]
    internal sealed class UpdateMessagePayload : TopologyMessagePayload
    {
        /// <summary>
        /// Create a driver message payload containing topology updates.
        /// </summary>
        /// <param name="updates">The topology updates</param>
        /// <param name="stageName">The stage context for the message</param>
        /// <param name="operatorId">The id of the operator receiving the topology update</param>
        /// <param name="iteration">The iteration in which the update takes effect</param>
        public UpdateMessagePayload(
            IEnumerable<TopologyUpdate> updates,
            string stageName,
            int operatorId,
            int iteration)
            : base(DriverMessagePayloadType.Update, updates, stageName, operatorId, iteration)
        {
        }
    }
}
\ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Config/ElasticServiceConfigurationOptions.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Config/ElasticServiceConfigurationOptions.cs new file mode 100644 index 0000000000..544c811525 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Config/ElasticServiceConfigurationOptions.cs @@ -0,0 +1,129 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements.
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License.

using System.Collections.Generic;
using Org.Apache.REEF.Tang.Annotations;

namespace Org.Apache.REEF.Network.Elastic.Config
{
    /// <summary>
    /// Container for the named parameters configuring the elastic group communication service.
    /// Each nested class is one named parameter; the attribute carries its documentation
    /// string and, where present, its default value.
    /// </summary>
    // NOTE(review): the type arguments of Name<...> were stripped from the extracted patch
    // text. They are reconstructed below from the default values and parameter names
    // (numeric defaults -> int, identifiers -> string, "Name>" -> a nested generic).
    // Confirm each one against the original source before relying on it.
    public sealed class ElasticServiceConfigurationOptions
    {
        // ---- Cluster sizing ----

        [NamedParameter("Number of Evaluators")]
        public sealed class NumEvaluators : Name<int>
        {
        }

        [NamedParameter("Number of Servers")]
        public sealed class NumServers : Name<int>
        {
        }

        [NamedParameter("Number of Workers")]
        public sealed class NumWorkers : Name<int>
        {
        }

        // ---- Failure retry and networking ----

        [NamedParameter(Documentation = "Number of retry when a failure occurs", DefaultValue = "1")]
        public sealed class RetryAfterFailure : Name<int>
        {
        }

        [NamedParameter(Documentation = "Starting port for TcpPortProvider", DefaultValue = "8900")]
        public sealed class StartingPort : Name<int>
        {
        }

        [NamedParameter(Documentation = "Port Range count for TcpPortProvider", DefaultValue = "1000")]
        public sealed class PortRange : Name<int>
        {
        }

        // ---- Driver, stages and tasks ----

        [NamedParameter("Driver identifier")]
        public sealed class DriverId : Name<string>
        {
        }

        [NamedParameter("Default Group name", defaultValue: "Stage1")]
        public sealed class DefaultStageName : Name<string>
        {
        }

        [NamedParameter("Number of tasks", defaultValue: "5")]
        public sealed class NumberOfTasks : Name<int>
        {
        }

        [NamedParameter("Serialized stages configuration")]
        public sealed class SerializedStageConfigs : Name<ISet<string>>
        {
        }

        // ---- Timeouts and message retries ----

        [NamedParameter("Timeout after which computation is consider inactive", defaultValue: "600000")]
        public sealed class Timeout : Name<int>
        {
        }

        [NamedParameter("Number of retry to send a message", defaultValue: "50")]
        public sealed class SendRetry : Name<int>
        {
        }

        [NamedParameter("Number of millisecond between each message retry", defaultValue: "1000")]
        public sealed class RetryWaitTime : Name<int>
        {
        }

        // ---- Failure thresholds ----

        [NamedParameter("Number of failures before a task abort the task set", defaultValue: "100")]
        public sealed class NumTaskFailures : Name<int>
        {
        }

        [NamedParameter("Number of failures before an evaluator abort the task set", defaultValue: "3")]
        public sealed class NumEvaluatorFailures : Name<int>
        {
        }

        // ---- Settings for a newly requested evaluator ----

        [NamedParameter(Documentation = "Rack name used when a new evaluator is requested", DefaultValue = "WonderlandRack")]
        public sealed class NewEvaluatorRackName : Name<string>
        {
        }

        [NamedParameter(Documentation = "Batch id used when a new evaluator is requested", DefaultValue = "IterateBroadcast")]
        public sealed class NewEvaluatorBatchId : Name<string>
        {
        }

        [NamedParameter(Documentation = "Number of cores used when a new evaluator is requested after a failure", DefaultValue = "1")]
        public sealed class NewEvaluatorNumCores : Name<int>
        {
        }

        [NamedParameter(Documentation = "Memory size used when a new evaluator is requested", DefaultValue = "512")]
        public sealed class NewEvaluatorMemorySize : Name<int>
        {
        }

        // ---- Checkpointing ----

        [NamedParameter("Number of checkpoints to store per operator", defaultValue: "1")]
        public sealed class NumCheckpoints : Name<int>
        {
        }
    }
}
diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Config/GroupCommunicationConfigurationOptions.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Config/GroupCommunicationConfigurationOptions.cs new file mode 100644 index
using Org.Apache.REEF.Tang.Annotations;

namespace Org.Apache.REEF.Network.Elastic.Config
{
    // NOTE(review): the type argument of every Name base class below appears to have been
    // stripped from this copy of the file. Confirm each value type against the original source.

    /// <summary>
    /// Class wrapping the configuration option parameters for task-side group communication.
    /// </summary>
    public sealed class GroupCommunicationConfigurationOptions
    {
        // NOTE(review): unit not stated in the attribute; presumably milliseconds - confirm.
        [NamedParameter("Timeout for sending or receiving messages", defaultValue: "10000")]
        public class Timeout : Name
        {
        }

        [NamedParameter("Number of retry to send a message", defaultValue: "15")]
        public class Retry : Name
        {
        }

        // NOTE(review): unit not stated in the attribute; presumably milliseconds - confirm.
        [NamedParameter("Timeout for disposing operators when messages are still in queue", defaultValue: "10000")]
        public class DisposeTimeout : Name
        {
        }

        /// <summary>
        /// Each communication group needs to check and wait until all the other nodes in the group are registered to the NameServer.
        /// Sleep time is set between each retry.
        /// </summary>
        [NamedParameter("sleep time (in milliseconds) to wait for nodes to be registered", defaultValue: "60000")]
        internal sealed class SleepTimeWaitingForRegistration : Name
        {
        }

        /// <summary>
        /// Each communication group needs to check and wait until all the other nodes in the group are registered to the NameServer.
        /// </summary>
        /// <remarks>
        /// If a node is waiting for others that need to download data, the waiting time could be long.
        /// As we can use a cancellation token to cancel the waiting for registration, setting this number higher should be OK.
        /// </remarks>
        [NamedParameter("Retry times to wait for nodes to be registered", defaultValue: "30")]
        internal sealed class RetryCountWaitingForRegistration : Name
        {
        }

        [NamedParameter("Whether the operator is in a rescheduled task", defaultValue: "false")]
        public sealed class IsRescheduled : Name
        {
        }
    }
}
using Org.Apache.REEF.Tang.Annotations;
using System.Collections.Generic;

namespace Org.Apache.REEF.Network.Elastic.Config
{
    // NOTE(review): the type argument of every Name base class below appears to have been
    // stripped from this copy of the file (including the nested ones left as "Name>").
    // Confirm each value type against the original source before compiling.

    /// <summary>
    /// Class wrapping the configuration option parameters for operators.
    /// Each nested class is a Tang named parameter; its documentation string and
    /// default value (when one exists) are given in the attribute.
    /// </summary>
    public sealed class OperatorParameters
    {
        [NamedParameter("Operator Name")]
        public sealed class OperatorType : Name
        {
        }

        [NamedParameter("Type of the message")]
        public sealed class MessageType : Name
        {
        }

        [NamedParameter("Operator Id")]
        public sealed class OperatorId : Name
        {
        }

        [NamedParameter("Name of the stage")]
        public sealed class StageName : Name
        {
        }

        // Collection-valued parameter: each entry is one serialized operator configuration.
        [NamedParameter("Serialized operator configuration")]
        public sealed class SerializedOperatorConfigs : Name>
        {
        }

        [NamedParameter("Request topology update", defaultValue: "false")]
        public sealed class RequestTopologyUpdate : Name
        {
        }

        [NamedParameter("Number of iterations")]
        public sealed class NumIterations : Name
        {
        }

        [NamedParameter("Number of element to scatter for each node", defaultValue: "0")]
        public sealed class NumScatterElements : Name
        {
        }

        [NamedParameter("Iteration number to begin with", defaultValue: "1")]
        public sealed class StartIteration : Name
        {
        }

        [NamedParameter("Master Id")]
        public sealed class MasterId : Name
        {
        }

        [NamedParameter("Checkpoint level", defaultValue: "0")]
        public sealed class Checkpointing : Name
        {
        }

        [NamedParameter("Whether the operator is the last to be executed in the stage", defaultValue: "false")]
        public sealed class IsLast : Name
        {
        }

        // NOTE(review): the default of -1 presumably means "no root assigned" - confirm with the consumer.
        [NamedParameter("Id of root task in operator topology", defaultValue: "-1")]
        public sealed class TopologyRootTaskId : Name
        {
        }

        [NamedParameter("Ids of child tasks in operator topology")]
        public sealed class TopologyChildTaskIds : Name>
        {
        }

        [NamedParameter("Whether topology updates can be piggybacked to data messages", defaultValue: "false")]
        public sealed class PiggybackTopologyUpdates : Name
        {
        }
    }
}
using Org.Apache.REEF.Network.Elastic.Comm.Impl;
using Org.Apache.REEF.Tang.Formats;
using Org.Apache.REEF.Tang.Util;
using Org.Apache.REEF.Wake.StreamingCodec;

namespace Org.Apache.REEF.Network.Elastic.Config
{
    // NOTE(review): text between angle brackets appears to have been stripped from this copy
    // of the file: the class's own type parameter (documented originally as "Generic type of
    // message") and every generic argument of RequiredImpl / GenericType below. Those tokens
    // are reproduced exactly as found; restore them from the original source before compiling.

    /// <summary>
    /// Defines configuration for streaming codecs of messages.
    /// </summary>
    public sealed class StreamingCodecConfiguration : ConfigurationModuleBuilder
    {
        /// <summary>
        /// RequiredImpl for Codec. Client needs to set implementation for this parameter.
        /// </summary>
        public static readonly RequiredImpl> Codec = new RequiredImpl>();

        /// <summary>
        /// Configuration Module for Codec.
        /// Binds the client-supplied <see cref="Codec"/> plus two further codec implementations.
        /// </summary>
        /// <remarks>
        /// Declared <c>readonly</c> (like <see cref="Codec"/>) so the shared module cannot be
        /// reassigned by other code after type initialization.
        /// </remarks>
        public static readonly ConfigurationModule Conf = new StreamingCodecConfiguration()
            .BindImplementation(GenericType>.Class, Codec)
            .BindImplementation(GenericType>>.Class,
                GenericType>.Class)
            .BindImplementation(GenericType>>.Class,
                GenericType>.Class)
            .Build();
    }
}
using System;
using System.Collections.Generic;
using System.Globalization;
using System.Net;
using Org.Apache.REEF.Common.Io;
using Org.Apache.REEF.Common.Services;
using Org.Apache.REEF.Driver.Task;
using Org.Apache.REEF.Network.Naming;
using Org.Apache.REEF.Network.NetworkService;
using Org.Apache.REEF.Tang.Annotations;
using Org.Apache.REEF.Tang.Formats;
using Org.Apache.REEF.Tang.Implementations.Tang;
using Org.Apache.REEF.Tang.Interface;
using Org.Apache.REEF.Tang.Util;
using Org.Apache.REEF.Utilities.Logging;
using Org.Apache.REEF.Network.Elastic.Config;
using Org.Apache.REEF.Network.Elastic.Failures;
using Org.Apache.REEF.Network.Elastic.Comm.Impl;
using Org.Apache.REEF.Network.Elastic.Comm;
using Org.Apache.REEF.Wake.Time.Event;
using Org.Apache.REEF.Utilities.Attributes;
using Org.Apache.REEF.Wake.Remote.Parameters;
using Org.Apache.REEF.Common.Tasks;
using Org.Apache.REEF.Network.Elastic.Task.Impl;
using Org.Apache.REEF.Driver.Evaluator;
using Org.Apache.REEF.Network.Elastic.Failures.Default;

namespace Org.Apache.REEF.Network.Elastic.Driver.Default
{
    // NOTE(review): text between angle brackets appears to have been stripped from this copy
    // of the file (generic type arguments such as those of Dictionary, List, IList,
    // GenericType and the Bind*NamedParam calls). Those tokens are reproduced exactly as
    // found; restore them from the original source before compiling.

    /// <summary>
    /// Default implementation for the task context.
    /// This is mainly used to create stages.
    /// Also manages configurations for Elastic Group Communication operators/contexts.
    /// </summary>
    [Unstable("0.16", "API may change")]
    internal sealed class DefaultElasticContext : IElasticContext, IDefaultFailureEventResponse
    {
        private readonly int _startingPort;
        private readonly int _portRange;
        private readonly string _driverId;
        private readonly int _numEvaluators;
        private readonly string _nameServerAddr;
        private readonly int _nameServerPort;
        private readonly INameServer _nameServer;
        private readonly string _defaultStageName;
        private readonly IFailureStateMachine _defaultFailureMachine;
        private readonly IEvaluatorRequestor _evaluatorRequestor;
        private readonly int _memory;
        private readonly int _cores;
        private readonly string _batchId;
        private readonly string _rackName;

        // Registered stages, keyed by stage name; guarded by _subsLock.
        private readonly Dictionary _stages = new Dictionary();
        private readonly AvroConfigurationSerializer _configSerializer;

        private readonly object _subsLock = new object();
        private readonly object _statusLock = new object();

        // Accumulated failure status; guarded by _statusLock.
        private IFailureState _failureStatus = new DefaultFailureState();

        /// <summary>
        /// Injectable constructor. Stores the configured parameters and caches the address
        /// and port of the name server's local endpoint.
        /// </summary>
        [Inject]
        private DefaultElasticContext(
            [Parameter(typeof(ElasticServiceConfigurationOptions.StartingPort))] int startingPort,
            [Parameter(typeof(ElasticServiceConfigurationOptions.PortRange))] int portRange,
            [Parameter(typeof(ElasticServiceConfigurationOptions.DriverId))] string driverId,
            [Parameter(typeof(ElasticServiceConfigurationOptions.DefaultStageName))] string defaultStageName,
            [Parameter(typeof(ElasticServiceConfigurationOptions.NumEvaluators))] int numEvaluators,
            [Parameter(typeof(ElasticServiceConfigurationOptions.NewEvaluatorMemorySize))] int memory,
            [Parameter(typeof(ElasticServiceConfigurationOptions.NewEvaluatorNumCores))] int cores,
            [Parameter(typeof(ElasticServiceConfigurationOptions.NewEvaluatorBatchId))] string batchId,
            [Parameter(typeof(ElasticServiceConfigurationOptions.NewEvaluatorRackName))] string rackName,
            AvroConfigurationSerializer configSerializer,
            IEvaluatorRequestor evaluatorRequestor,
            INameServer nameServer,
            IFailureStateMachine defaultFailureStateMachine)
        {
            _startingPort = startingPort;
            _portRange = portRange;
            _driverId = driverId;
            _numEvaluators = numEvaluators;
            _defaultStageName = defaultStageName;
            _defaultFailureMachine = defaultFailureStateMachine;
            _evaluatorRequestor = evaluatorRequestor;
            _memory = memory;
            _cores = cores;
            _batchId = batchId;
            _rackName = rackName;
            _configSerializer = configSerializer;
            _nameServer = nameServer;
            IPEndPoint localEndpoint = nameServer.LocalEndpoint;
            _nameServerAddr = localEndpoint.Address.ToString();
            _nameServerPort = localEndpoint.Port;
        }

        /// <summary>
        /// Returns a stage with the default settings (default name and failure machine).
        /// The stage is created on first use and the same instance is returned afterwards.
        /// </summary>
        /// <returns>A stage with default settings</returns>
        public IElasticStage DefaultStage()
        {
            // CreateNewStage takes the same lock; C# monitors are re-entrant, so this is safe.
            lock (_subsLock)
            {
                if (_stages.TryGetValue(_defaultStageName, out IElasticStage defaultStage))
                {
                    return defaultStage;
                }

                return CreateNewStage(
                    _defaultStageName,
                    _numEvaluators,
                    _defaultFailureMachine.Clone(_numEvaluators, (int)DefaultFailureStates.Fail));
            }
        }

        /// <summary>
        /// Creates a new stage.
        /// The stage lifecycle is managed by the context.
        /// </summary>
        /// <param name="stageName">The name of the stage</param>
        /// <param name="numTasks">The number of tasks required by the stage</param>
        /// <param name="failureMachine">An optional failure machine governing the stage;
        /// when null, a clone of the default failure machine is used</param>
        /// <returns>The new stage</returns>
        /// <exception cref="ArgumentNullException">If <paramref name="stageName"/> is null or empty</exception>
        /// <exception cref="ArgumentException">If <paramref name="numTasks"/> is not positive,
        /// or a stage with the same name is already registered</exception>
        public IElasticStage CreateNewStage(
            string stageName,
            int numTasks,
            IFailureStateMachine failureMachine = null)
        {
            if (string.IsNullOrEmpty(stageName))
            {
                // The single-string constructor takes a parameter name, not a message:
                // use the (paramName, message) overload so both are reported correctly.
                throw new ArgumentNullException(nameof(stageName), "Stage name cannot be null or empty.");
            }

            if (numTasks <= 0)
            {
                throw new ArgumentException($"{nameof(numTasks)} is required to be greater than 0.");
            }

            lock (_subsLock)
            {
                if (_stages.ContainsKey(stageName))
                {
                    throw new ArgumentException($"Stage {stageName} already registered with the context.");
                }

                var stage = new DefaultElasticStage(
                    stageName,
                    numTasks,
                    this,
                    failureMachine ?? _defaultFailureMachine.Clone(numTasks, (int)DefaultFailureStates.Fail));
                _stages[stageName] = stage;

                return stage;
            }
        }

        /// <summary>
        /// Remove a stage from the context.
        /// </summary>
        /// <param name="stageName">The name of the stage to be removed</param>
        /// <exception cref="ArgumentException">If no stage with that name is registered</exception>
        public void RemoveElasticStage(string stageName)
        {
            lock (_subsLock)
            {
                if (!_stages.Remove(stageName))
                {
                    throw new ArgumentException($"Stage {stageName} is not registered with the context.");
                }
            }
        }

        /// <summary>
        /// Generate the base configuration module for tasks.
        /// This method can be used to generate configurations for the task set manager.
        /// </summary>
        /// <param name="taskId">The id of the task the configuration is generated for</param>
        /// <returns>The module with the identifier, message handler and close handler set for the task</returns>
        public ConfigurationModule GetTaskConfigurationModule(string taskId)
        {
            return TaskConfiguration.ConfigurationModule
                .Set(TaskConfiguration.Identifier, taskId)
                .Set(TaskConfiguration.OnMessage, GenericType.Class)
                .Set(TaskConfiguration.OnClose, GenericType.Class);
        }

        /// <summary>
        /// Start the elastic group communication context.
        /// This will trigger requests for resources as specified by the parameters.
        /// </summary>
        public void Start()
        {
            var request = _evaluatorRequestor.NewBuilder()
                .SetNumber(_numEvaluators)
                .SetMegabytes(_memory)
                .SetCores(_cores)
                .SetRackName(_rackName)
                .SetEvaluatorBatchId(_batchId)
                .Build();

            _evaluatorRequestor.Submit(request);
        }

        /// <summary>
        /// Create a new task set manager sized to the configured number of evaluators.
        /// </summary>
        /// <param name="masterTaskConfiguration">The configuration for the master task</param>
        /// <param name="slaveTaskConfiguration">The configuration for the slave task</param>
        /// <returns>A new task set manager</returns>
        public IElasticTaskSetManager CreateNewTaskSetManager(
            TaskConfigurator masterTaskConfiguration,
            TaskConfigurator slaveTaskConfiguration = null)
        {
            return CreateNewTaskSetManager(_numEvaluators, masterTaskConfiguration, slaveTaskConfiguration);
        }

        /// <summary>
        /// Create a new task set manager.
        /// </summary>
        /// <param name="numOfTasks">The number of tasks the task set should manage</param>
        /// <param name="masterTaskConfiguration">The configuration for the master task</param>
        /// <param name="slaveTaskConfiguration">The configuration for the slave task</param>
        /// <returns>A new task set manager</returns>
        public IElasticTaskSetManager CreateNewTaskSetManager(
            int numOfTasks,
            TaskConfigurator masterTaskConfiguration,
            TaskConfigurator slaveTaskConfiguration = null)
        {
            return new DefaultElasticTaskSetManager(
                numOfTasks,
                _evaluatorRequestor,
                _driverId,
                masterTaskConfiguration,
                slaveTaskConfiguration);
        }

        /// <summary>
        /// Generate the elastic service configuration object.
        /// This method is used to properly configure task contexts with the elastic service.
        /// </summary>
        /// <returns>The elastic service configuration</returns>
        public IConfiguration GetElasticServiceConfiguration()
        {
            IConfiguration contextConfig = ServiceConfiguration.ConfigurationModule
                .Set(ServiceConfiguration.Services,
                    GenericType>.Class)
                .Build();

            // Ports are machine-readable configuration values: format them culture-invariantly.
            return TangFactory.GetTang().NewConfigurationBuilder(contextConfig)
                .BindStringNamedParam(_nameServerAddr)
                .BindIntNamedParam(_nameServerPort.ToString(CultureInfo.InvariantCulture))
                .BindImplementation()
                .BindIntNamedParam(_startingPort.ToString(CultureInfo.InvariantCulture))
                .BindIntNamedParam(_portRange.ToString(CultureInfo.InvariantCulture))
                .Build();
        }

        /// <summary>
        /// Appends a stage configuration to a configuration builder object.
        /// </summary>
        /// <param name="confBuilder">The configuration where the stage configuration will be appended to</param>
        /// <param name="stageConfiguration">The stage configuration at hand</param>
        public void SerializeStageConfiguration(
            ref ICsConfigurationBuilder confBuilder,
            IConfiguration stageConfiguration)
        {
            confBuilder.BindSetEntry(
                GenericType.Class,
                _configSerializer.ToString(stageConfiguration));
        }

        /// <summary>
        /// Append an operator configuration to a list of serialized configurations.
        /// </summary>
        /// <param name="serializedOperatorsConfs">The list where the operator configuration
        /// will be appended to</param>
        /// <param name="operatorConfiguration">The operator configuration at hand</param>
        public void SerializeOperatorConfiguration(
            ref IList serializedOperatorsConfs,
            IConfiguration operatorConfiguration)
        {
            serializedOperatorsConfs.Add(_configSerializer.ToString(operatorConfiguration));
        }

        #region Failure Response

        /// <summary>
        /// Used to react on a failure occurred on a task.
        /// This implementation unregisters the failed task's id from the name server and
        /// does not add anything to <paramref name="failureEvents"/>.
        /// </summary>
        /// <param name="value">The failed task</param>
        /// <param name="failureEvents">A list of events encoding the type of actions to be triggered so far</param>
        public void OnTaskFailure(IFailedTask value, ref List failureEvents)
        {
            var task = value.Id;
            _nameServer.Unregister(task);
        }

        /// <summary>
        /// Used to react when a timeout event is triggered.
        /// This implementation intentionally does nothing.
        /// </summary>
        /// <param name="alarm">The alarm triggering the timeout</param>
        /// <param name="msgs">A list of messages encoding how remote tasks need to react</param>
        /// <param name="nextTimeouts">The next timeouts to be scheduled</param>
        public void OnTimeout(Alarm alarm, ref List msgs, ref List nextTimeouts)
        {
        }

        /// <summary>
        /// When a new failure state is reached, this method is used to dispatch
        /// such event to the proper failure mitigation logic.
        /// Any event kind other than reconfigure, reschedule or stop is treated as a failure.
        /// </summary>
        /// <param name="event">The failure event to react upon</param>
        public void EventDispatcher(ref IFailureEvent @event)
        {
            switch ((DefaultFailureStateEvents)@event.FailureEvent)
            {
                case DefaultFailureStateEvents.Reconfigure:
                    var rec = @event as ReconfigureEvent;
                    OnReconfigure(ref rec);
                    break;

                case DefaultFailureStateEvents.Reschedule:
                    var res = @event as RescheduleEvent;
                    OnReschedule(ref res);
                    break;

                case DefaultFailureStateEvents.Stop:
                    var stp = @event as StopEvent;
                    OnStop(ref stp);
                    break;

                default:
                    OnFail();
                    break;
            }
        }

        #endregion Failure Response

        #region Default Failure event Response

        /// <summary>
        /// Mechanism to execute when a reconfigure event is triggered:
        /// merges <c>ContinueAndReconfigure</c> into the context's failure status.
        /// </summary>
        /// <param name="reconfigureEvent">The triggering event (not inspected here)</param>
        public void OnReconfigure(ref ReconfigureEvent reconfigureEvent)
        {
            lock (_statusLock)
            {
                _failureStatus = _failureStatus.Merge(
                    new DefaultFailureState((int)DefaultFailureStates.ContinueAndReconfigure));
            }
        }

        /// <summary>
        /// Mechanism to execute when a reschedule event is triggered:
        /// merges <c>ContinueAndReschedule</c> into the context's failure status.
        /// </summary>
        /// <param name="rescheduleEvent">The triggering event (not inspected here)</param>
        public void OnReschedule(ref RescheduleEvent rescheduleEvent)
        {
            lock (_statusLock)
            {
                _failureStatus = _failureStatus.Merge(
                    new DefaultFailureState((int)DefaultFailureStates.ContinueAndReschedule));
            }
        }

        /// <summary>
        /// Mechanism to execute when a stop event is triggered:
        /// merges <c>StopAndReschedule</c> into the context's failure status.
        /// </summary>
        /// <param name="stopEvent">The triggering event (not inspected here)</param>
        public void OnStop(ref StopEvent stopEvent)
        {
            lock (_statusLock)
            {
                _failureStatus = _failureStatus.Merge(new DefaultFailureState((int)DefaultFailureStates.StopAndReschedule));
            }
        }

        /// <summary>
        /// Mechanism to execute when a fail event is triggered:
        /// merges <c>Fail</c> into the context's failure status.
        /// </summary>
        public void OnFail()
        {
            lock (_statusLock)
            {
                _failureStatus = _failureStatus.Merge(new DefaultFailureState((int)DefaultFailureStates.Fail));
            }
        }

        #endregion Default Failure event Response
    }
}
using System;
using Org.Apache.REEF.Driver;
using Org.Apache.REEF.Driver.Context;
using Org.Apache.REEF.Driver.Evaluator;
using Org.Apache.REEF.Tang.Annotations;
using Org.Apache.REEF.Tang.Implementations.Configuration;
using Org.Apache.REEF.Tang.Interface;
using Org.Apache.REEF.Driver.Task;
using Org.Apache.REEF.Common.Context;
using Org.Apache.REEF.Utilities.Attributes;

namespace Org.Apache.REEF.Network.Elastic.Driver.Default
{
    // NOTE(review): the type arguments of the eight IObserver interfaces below appear to have
    // been stripped from this copy of the file. Presumably they are the eight event types
    // handled by the OnNext overloads in this class (IDriverStarted, IAllocatedEvaluator,
    // IActiveContext, IRunningTask, ICompletedTask, IFailedEvaluator, IFailedTask,
    // ITaskMessage) - confirm against the original source.

    /// <summary>
    /// Default implementation of the elastic driver.
    /// Forwards driver events to the <see cref="Context"/> and the <see cref="TaskSetManager"/>.
    /// </summary>
    [Unstable("0.16", "API may change")]
    public abstract class DefaultElasticDriver :
        IObserver,
        IObserver,
        IObserver,
        IObserver,
        IObserver,
        IObserver,
        IObserver,
        IObserver
    {
        /// <summary>
        /// Injectable constructor storing the elastic context used by this driver.
        /// </summary>
        /// <param name="context">The elastic context</param>
        [Inject]
        protected DefaultElasticDriver(IElasticContext context)
        {
            Context = context;
        }

        /// <summary>
        /// The elastic context supplied at construction time.
        /// </summary>
        public IElasticContext Context { get; }

        /// <summary>
        /// The task set manager that receives the driver events.
        /// It is not assigned in this class; every handler except the driver-started one
        /// dereferences it, so it must be set before those events arrive.
        /// </summary>
        public IElasticTaskSetManager TaskSetManager { get; set; }

        /// <summary>
        /// Driver started: starts the elastic context.
        /// </summary>
        public void OnNext(IDriverStarted value)
        {
            Context.Start();
        }

        /// <summary>
        /// Evaluator allocated: if the task set manager provides a context id for it, submits
        /// a context plus the elastic service (merged with the codec configuration);
        /// otherwise the evaluator is disposed.
        /// </summary>
        public void OnNext(IAllocatedEvaluator allocatedEvaluator)
        {
            if (TaskSetManager.TryGetNextTaskContextId(allocatedEvaluator, out string identifier))
            {
                IConfiguration contextConf = ContextConfiguration.ConfigurationModule
                    .Set(ContextConfiguration.Identifier, identifier)
                    .Build();
                IConfiguration serviceConf = Context.GetElasticServiceConfiguration();
                IConfiguration codecConf = TaskSetManager.GetCodecConfiguration();

                serviceConf = Configurations.Merge(serviceConf, codecConf);
                allocatedEvaluator.SubmitContextAndService(contextConf, serviceConf);
            }
            else
            {
                allocatedEvaluator.Dispose();
            }
        }

        /// <summary>
        /// Context active: forwarded to the task set manager.
        /// </summary>
        public void OnNext(IActiveContext activeContext)
        {
            TaskSetManager.OnNewActiveContext(activeContext);
        }

        /// <summary>
        /// Task running: forwarded to the task set manager.
        /// </summary>
        public void OnNext(IRunningTask value)
        {
            TaskSetManager.OnTaskRunning(value);
        }

        /// <summary>
        /// Task completed: forwarded to the task set manager, which is disposed
        /// once it reports completion.
        /// </summary>
        public void OnNext(ICompletedTask value)
        {
            TaskSetManager.OnTaskCompleted(value);

            if (TaskSetManager.IsCompleted)
            {
                TaskSetManager.Dispose();
            }
        }

        /// <summary>
        /// Evaluator failed: forwarded to the task set manager, which is disposed
        /// once it reports completion.
        /// </summary>
        public void OnNext(IFailedEvaluator failedEvaluator)
        {
            TaskSetManager.OnEvaluatorFailure(failedEvaluator);

            if (TaskSetManager.IsCompleted)
            {
                TaskSetManager.Dispose();
            }
        }

        /// <summary>
        /// Task failed: forwarded to the task set manager, which is disposed
        /// once it reports completion.
        /// </summary>
        public void OnNext(IFailedTask failedTask)
        {
            TaskSetManager.OnTaskFailure(failedTask);

            if (TaskSetManager.IsCompleted)
            {
                TaskSetManager.Dispose();
            }
        }

        /// <summary>
        /// Task message received: forwarded to the task set manager.
        /// </summary>
        public void OnNext(ITaskMessage taskMessage)
        {
            TaskSetManager.OnTaskMessage(taskMessage);
        }

        /// <summary>
        /// Stream completed: disposes the task set manager.
        /// </summary>
        public void OnCompleted()
        {
            TaskSetManager.Dispose();
        }

        /// <summary>
        /// Stream error: disposes the task set manager. The error itself is not inspected.
        /// </summary>
        public void OnError(Exception error)
        {
            TaskSetManager.Dispose();
        }
    }
}
+ +using Org.Apache.REEF.Driver.Task; +using Org.Apache.REEF.Tang.Exceptions; +using Org.Apache.REEF.Tang.Interface; +using Org.Apache.REEF.Tang.Util; +using Org.Apache.REEF.Utilities.Logging; +using System.Threading; +using Org.Apache.REEF.Driver.Context; +using Org.Apache.REEF.Network.Elastic.Failures; +using System.Collections.Generic; +using Org.Apache.REEF.Network.Elastic.Comm; +using System.Linq; +using Org.Apache.REEF.Wake.Time.Event; +using Org.Apache.REEF.IO.PartitionedData; +using Org.Apache.REEF.Utilities; +using System; +using Org.Apache.REEF.Utilities.Attributes; +using Org.Apache.REEF.Network.Elastic.Failures.Default; +using Org.Apache.REEF.Network.Elastic.Operators.Logical.Default; +using Org.Apache.REEF.Network.Elastic.Operators.Logical; +using Org.Apache.REEF.Tang.Implementations.Tang; + +namespace Org.Apache.REEF.Network.Elastic.Driver.Default +{ + /// + /// Used to group elastic operators into logical units. + /// All operators in the same stages share similar semantics and behavior + /// under failures. Stages can only be created by a service. + /// This class is used to create stages able to manage default failure events. + /// + [Unstable("0.16", "API may change")] + internal sealed class DefaultElasticStage : IElasticStage, IDefaultFailureEventResponse + { + private static readonly Logger Log = Logger.GetLogger(typeof(DefaultElasticStage)); + + private bool _finalized = false; + private volatile bool _scheduled = false; + + private readonly int _numTasks; + private int _tasksAdded = 0; + private HashSet _missingMasterTasks = new HashSet(); + private HashSet _masterTasks = new HashSet(); + private readonly IFailureStateMachine _failureMachine; + + private int _numOperators; + private Optional _datasetConfiguration; + private bool _isMasterGettingInputData; + + private readonly object _tasksLock = new object(); + private readonly object _statusLock = new object(); + + /// + /// Create a new stage with the input settings. 
+ /// + /// The name of the stage + /// The number of tasks managed by the stage + /// The service managing the stage + /// The failure machine for the stage + internal DefaultElasticStage( + string stageName, + int numTasks, + IElasticContext elasticService, + IFailureStateMachine failureMachine = null) + { + StageName = stageName; + _numTasks = numTasks; + _datasetConfiguration = Optional.Empty(); + Context = elasticService; + _failureMachine = failureMachine ?? new DefaultFailureStateMachine(numTasks, DefaultFailureStates.Fail); + FailureState = _failureMachine.State; + PipelineRoot = new DefaultEmpty(this, _failureMachine.Clone()); + + IsIterative = false; + } + + /// + /// The name of the stages. + /// + public string StageName { get; set; } + + /// + /// The operator at the beginning of the computation workflow. + /// + public ElasticOperator PipelineRoot { get; private set; } + + /// + /// The service managing the stages. + /// + /// Whether the stages contains iterations or not. + /// + public bool IsIterative { get; set; } + + /// + /// The failure state of the target stages. + /// + public IFailureState FailureState { get; private set; } + + /// + /// Whether the stages is completed or not. + /// + public bool IsCompleted + { + get { return FailureState.FailureState.IsComplete(); } + } + + /// + /// Generates an id to uniquely identify operators in the stages. + /// + /// A new unique id + public int GetNextOperatorId() + { + return Interlocked.Increment(ref _numOperators); + } + + /// + /// Add a partitioned dataset to the stage. + /// + /// The partitioned dataset + /// Whether the master node should get a partition + public void AddDataset(IPartitionedInputDataSet inputDataSet, bool isMasterGettingInputData = false) + { + AddDataset(inputDataSet.Select(x => x.GetPartitionConfiguration()).ToArray(), isMasterGettingInputData); + } + + /// + /// Add a set of datasets to the stage. 
+        ///
+        /// <param name="inputDataSet">The configuration for the datasets</param>
+        /// <param name="isMasterGettingInputData">Whether the master node should get a partition</param>
+        public void AddDataset(IConfiguration[] inputDataSet, bool isMasterGettingInputData = false)
+        {
+            _isMasterGettingInputData = isMasterGettingInputData;
+
+            _datasetConfiguration = Optional.Of(inputDataSet);
+        }
+
+        /// <summary>
+        /// Finalizes the stage.
+        /// After the stage has been finalized, no more operators can
+        /// be added to the group.
+        /// </summary>
+        /// <returns>The same finalized stage</returns>
+        /// <exception cref="IllegalStateException">If the stage was already built, or the dataset
+        /// has fewer partitions than required by the number of tasks</exception>
+        public IElasticStage Build()
+        {
+            if (_finalized)
+            {
+                throw new IllegalStateException("Stage cannot be built more than once");
+            }
+
+            if (_datasetConfiguration.IsPresent())
+            {
+                // When the master does not read data, one task needs no partition.
+                var adjust = _isMasterGettingInputData ? 0 : 1;
+
+                if (_datasetConfiguration.Value.Length + adjust < _numTasks)
+                {
+                    throw new IllegalStateException(
+                        "Dataset is smaller than the number of tasks: "
+                        + $"re-submit with {_datasetConfiguration.Value.Length + adjust} tasks");
+                }
+            }
+
+            PipelineRoot.GatherMasterIds(ref _masterTasks);
+
+            // Every master task is still missing until AddTask sees it. Without seeding this set
+            // the slot reservation performed in AddTask can never trigger.
+            _missingMasterTasks.UnionWith(_masterTasks);
+
+            _finalized = true;
+
+            return this;
+        }
+
+        /// <summary>
+        /// Add a task to the stage.
+        /// The stage must have been built before tasks can be added.
+        /// </summary>
+        /// <param name="taskId">The id of the task to add</param>
+        /// <returns>False if the stage is completed, has failed after being scheduled, or has no
+        /// slot available for the task; true otherwise (also when the pipeline itself did not
+        /// register the task)</returns>
+        /// <exception cref="ArgumentException">If the task id is null or empty</exception>
+        /// <exception cref="IllegalStateException">If the stage has not been built yet</exception>
+        public bool AddTask(string taskId)
+        {
+            if (string.IsNullOrEmpty(taskId))
+            {
+                throw new ArgumentException($"{nameof(taskId)} cannot be empty.");
+            }
+
+            if (IsCompleted || (_scheduled && FailureState.FailureState.IsFail()))
+            {
+                Log.Log(Level.Warning, "Taskset {0}", IsCompleted ? "completed." : "failed.");
+                return false;
+            }
+
+            if (!_finalized)
+            {
+                throw new IllegalStateException("Stage must be finalized before adding tasks.");
+            }
+
+            lock (_tasksLock)
+            {
+                // We don't add a task if eventually we end up by not adding the master task
+                var tooManyTasks = _tasksAdded >= _numTasks;
+                var notAddingMaster = _tasksAdded + _missingMasterTasks.Count >= _numTasks &&
+                    !_missingMasterTasks.Contains(taskId);
+
+                // The limits only apply before the stage is scheduled.
+                if (!_scheduled && (tooManyTasks || notAddingMaster))
+                {
+                    if (tooManyTasks)
+                    {
+                        Log.Log(Level.Warning,
+                            "Already added {0} tasks when total tasks request is {1}", _tasksAdded, _numTasks);
+                    }
+
+                    if (notAddingMaster)
+                    {
+                        Log.Log(Level.Warning,
+                            "Already added {0} over {1} but missing master task(s)", _tasksAdded, _numTasks);
+                    }
+
+                    return false;
+                }
+
+                if (PipelineRoot.AddTask(taskId))
+                {
+                    _tasksAdded++;
+                    _missingMasterTasks.Remove(taskId);
+                    _failureMachine.AddDataPoints(1, false);
+                }
+            }
+
+            return true;
+        }
+
+        /// <summary>
+        /// Decides if the tasks added to the stage can be scheduled for execution
+        /// or not. This method is used for implementing different policies for
+        /// triggering the scheduling of tasks.
+        /// </summary>
+        /// <returns>True if the previously added tasks can be scheduled for execution</returns>
+        public bool ScheduleStage()
+        {
+            // Schedule if we reach the number of requested tasks or the stage contains an iterative pipeline
+            // that is ready to be scheduled and the policy requested by the user allow early start with ramp up.
+            if (!_scheduled &&
+                (_numTasks == _tasksAdded ||
+                (IsIterative &&
+                _failureMachine.State.FailureState < (int)DefaultFailureStates.StopAndReschedule &&
+                PipelineRoot.CanBeScheduled())))
+            {
+                _scheduled = true;
+
+                PipelineRoot.BuildState();
+            }
+
+            return _scheduled;
+        }
+
+        ///
+        /// Whether the input activeContext is the one of the master tasks.
+        ///
+        /// <param name="activeContext">The active context of the task</param>
+        /// <returns>True if the input parameter is the master task's active context</returns>
+        /// <exception cref="IllegalStateException">If the stage has not been built yet</exception>
+        public bool IsMasterTaskContext(IActiveContext activeContext)
+        {
+            if (!_finalized)
+            {
+                throw new IllegalStateException("Driver must call Build() before checking IsMasterTaskContext.");
+            }
+
+            int id = Utils.GetContextNum(activeContext);
+            return _masterTasks.Any(task => Utils.GetTaskNum(task) == id);
+        }
+
+        /// <summary>
+        /// Creates the configuration for the input task.
+        /// Must be called only after all tasks have been added to the stage.
+        /// </summary>
+        /// <param name="taskId">The task id of the task that belongs to this stage</param>
+        /// <returns>The configuration for the task, carrying the stage name and the operator
+        /// configurations collected from the pipeline</returns>
+        public IConfiguration GetTaskConfiguration(int taskId)
+        {
+            ICsConfigurationBuilder confBuilder = TangFactory.GetTang().NewConfigurationBuilder();
+            IList serializedOperatorsConfs = new List();
+
+            PipelineRoot.GetTaskConfiguration(ref serializedOperatorsConfs, taskId);
+
+            return confBuilder
+                .BindStringNamedParam(StageName)
+                .BindList(serializedOperatorsConfs)
+                .Build();
+        }
+
+        /// <summary>
+        /// Given a task id, this method returns the configuration of the task's data partition
+        /// (if any).
+        /// </summary>
+        /// <param name="taskId">The task id of the task we want to retrieve the data partition for.
+        /// The task is required to belong to the stage</param>
+        /// <returns>The configuration of the data partition (if any) of the task</returns>
+        /// <exception cref="IllegalStateException">If the computed partition index falls outside
+        /// the dataset</exception>
+        public Optional GetPartitionConf(string taskId)
+        {
+            if (!_datasetConfiguration.IsPresent() || (_masterTasks.Contains(taskId) && !_isMasterGettingInputData))
+            {
+                return Optional.Empty();
+            }
+
+            // Task numbers are 1-based. When master tasks exist and do not read data, the index
+            // is shifted down by one more position.
+            var index = Utils.GetTaskNum(taskId) - 1;
+            index = _masterTasks.Count == 0 || _isMasterGettingInputData ? index : index - 1;
+
+            if (index < 0 || index >= _datasetConfiguration.Value.Length)
+            {
+                throw new IllegalStateException($"Asking for a not existing partition configuration {index}.");
+            }
+
+            return Optional.Of(_datasetConfiguration.Value[index]);
+        }
+
+        /// <summary>
+        /// Method used to signal that the stage state can be moved to complete.
+        /// </summary>
+        public void Complete()
+        {
+            lock (_statusLock)
+            {
+                FailureState = FailureState.Merge(_failureMachine.Complete());
+            }
+        }
+
+        /// <summary>
+        /// Retrieve the final statistics of the computation as reported by the pipeline root.
+        /// This method can be called only once the stage is completed or failed.
+        /// </summary>
+        /// <returns>The final statistics for the computation</returns>
+        /// <exception cref="IllegalStateException">If the stage is neither completed nor failed</exception>
+        public string LogFinalStatistics()
+        {
+            if (IsCompleted || FailureState.FailureState.IsFail())
+            {
+                return PipelineRoot.LogFinalStatistics();
+            }
+            else
+            {
+                throw new IllegalStateException(
+                    $"Cannot log statistics before Stage {StageName} is completed or failed.");
+            }
+        }
+
+        /// <summary>
+        /// Method triggered when a task to driver message is received.
+        /// The payload starts with a 16-bit length followed by the name of the target stage;
+        /// the message is forwarded to the pipeline only when that name matches this stage.
+        /// </summary>
+        /// <param name="message">The task message for the operator</param>
+        /// <returns>The driver messages produced by the pipeline, or an empty sequence when the
+        /// message targets another stage</returns>
+        /// <exception cref="IllegalStateException">If the message cannot be handled correctly or
+        /// generates an incorrect state</exception>
+        public IEnumerable OnTaskMessage(ITaskMessage message)
+        {
+            int offset = 0;
+            var length = BitConverter.ToUInt16(message.Message, offset);
+            offset += sizeof(ushort);
+            var stageName = ByteUtilities.ByteArraysToString(message.Message, offset, length);
+
+            if (stageName == StageName)
+            {
+                // Messages have to be propagated down to the operators
+                return PipelineRoot.OnTaskMessage(message);
+            }
+
+            return new IElasticDriverMessage[] { };
+        }
+
+        #region Failure Response
+
+        ///
+        /// Used to react when a timeout event is triggered.
+        ///
+        /// <param name="alarm">The alarm triggering the timeout</param>
+        /// <param name="msgs">A list of messages encoding how remote tasks need to react</param>
+        /// <param name="nextTimeouts">The next timeouts to be scheduled</param>
+        public void OnTimeout(Alarm alarm, ref List msgs, ref List nextTimeouts)
+            => PipelineRoot.OnTimeout(alarm, ref msgs, ref nextTimeouts);
+
+        /// <summary>
+        /// Reacts to the failure of a task by handing it over to the operators of the pipeline,
+        /// which append zero or more failure events.
+        /// </summary>
+        /// <param name="task">The failed task</param>
+        /// <param name="failureEvents">A list of events encoding the type of actions to be triggered so far</param>
+        /// <exception cref="Exception">If the task failure cannot be properly handled</exception>
+        public void OnTaskFailure(IFailedTask task, ref List failureEvents)
+            => PipelineRoot.OnTaskFailure(task, ref failureEvents);
+
+        /// <summary>
+        /// Routes a failure event to the matching mitigation handler of the stage and then
+        /// forwards it to the pipeline. Events that are not reconfigure, reschedule or stop
+        /// events are treated as a failure of the stage.
+        /// </summary>
+        /// <param name="event">The failure event to react upon</param>
+        public void EventDispatcher(ref IFailureEvent @event)
+        {
+            var kind = (DefaultFailureStateEvents)@event.FailureEvent;
+
+            if (kind == DefaultFailureStateEvents.Reconfigure)
+            {
+                var reconfigure = @event as ReconfigureEvent;
+                OnReconfigure(ref reconfigure);
+            }
+            else if (kind == DefaultFailureStateEvents.Reschedule)
+            {
+                var reschedule = @event as RescheduleEvent;
+                OnReschedule(ref reschedule);
+            }
+            else if (kind == DefaultFailureStateEvents.Stop)
+            {
+                var stop = @event as StopEvent;
+                OnStop(ref stop);
+            }
+            else
+            {
+                OnFail();
+            }
+
+            PipelineRoot.EventDispatcher(ref @event);
+        }
+
+        #endregion Failure Response
+
+        #region Default Failure Events Response
+
+        ///
+        /// Mechanism to execute when a reconfigure event is triggered.
+        ///
+        /// <param name="reconfigureEvent">The reconfigure event (not inspected by the stage)</param>
+        public void OnReconfigure(ref ReconfigureEvent reconfigureEvent)
+        {
+            MergeFailureState((int)DefaultFailureStates.ContinueAndReconfigure);
+        }
+
+        /// <summary>
+        /// Mechanism to execute when a reschedule event is triggered.
+        /// </summary>
+        /// <param name="rescheduleEvent">The reschedule event (not inspected by the stage)</param>
+        public void OnReschedule(ref RescheduleEvent rescheduleEvent)
+        {
+            MergeFailureState((int)DefaultFailureStates.ContinueAndReschedule);
+        }
+
+        /// <summary>
+        /// Mechanism to execute when a stop event is triggered.
+        /// </summary>
+        /// <param name="stopEvent">The stop event (not inspected by the stage)</param>
+        public void OnStop(ref StopEvent stopEvent)
+        {
+            MergeFailureState((int)DefaultFailureStates.StopAndReschedule);
+        }
+
+        /// <summary>
+        /// Mechanism to execute when a fail event is triggered.
+        /// </summary>
+        public void OnFail()
+        {
+            MergeFailureState((int)DefaultFailureStates.Fail);
+        }
+
+        /// <summary>
+        /// Merges the given failure state into the state of the stage while holding the status lock.
+        /// </summary>
+        /// <param name="state">The numeric failure state to merge</param>
+        private void MergeFailureState(int state)
+        {
+            lock (_statusLock)
+            {
+                FailureState = FailureState.Merge(new DefaultFailureState(state));
+            }
+        }
+
+        #endregion Default Failure Events Response
+    }
+}
\ No newline at end of file
diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticTaskSetManager.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticTaskSetManager.cs
new file mode 100644
index 0000000000..f124104f19
--- /dev/null
+++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticTaskSetManager.cs
@@ -0,0 +1,1351 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.
You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Threading;
+using Org.Apache.REEF.Driver.Context;
+using Org.Apache.REEF.Tang.Implementations.Configuration;
+using Org.Apache.REEF.Tang.Interface;
+using Org.Apache.REEF.Utilities.Logging;
+using Org.Apache.REEF.Tang.Exceptions;
+using Org.Apache.REEF.Driver.Evaluator;
+using Org.Apache.REEF.Tang.Implementations.Tang;
+using Org.Apache.REEF.Driver.Task;
+using Org.Apache.REEF.Network.Elastic.Failures;
+using Org.Apache.REEF.Network.Elastic.Comm;
+using System.Collections.Concurrent;
+using Org.Apache.REEF.Wake.Time.Event;
+using Org.Apache.REEF.Network.Elastic.Config;
+using Org.Apache.REEF.Tang.Util;
+using Org.Apache.REEF.Network.Elastic.Failures.Default;
+using Org.Apache.REEF.Utilities.Attributes;
+
+namespace Org.Apache.REEF.Network.Elastic.Driver.Default
+{
+    /// <summary>
+    /// Class managing the scheduling of tasks and task-related events.
+    /// </summary>
+    [Unstable("0.16", "API may change")]
+    internal sealed class DefaultElasticTaskSetManager :
+        IElasticTaskSetManager,
+        IDefaultFailureEventResponse,
+        IObserver
+    {
+        #region Private structs
+
+        // Struct managing state for re-scheduling contexts after evaluator failures.
+        private struct ContextInfo
+        {
+            /// <summary>
+            /// Creates the info for a context submitted for the first time.
+            /// </summary>
+            /// <param name="id">The context id</param>
+            public ContextInfo(int id)
+            {
+                Id = id;
+                NumRetry = 1;
+            }
+
+            /// <summary>
+            /// The context id.
+            /// </summary>
+            public int Id { get; private set; }
+
+            /// <summary>
+            /// The number of times we tried to submit the context.
+            /// </summary>
+            public int NumRetry { get; set; }
+        }
+
+        #endregion Private structs
+
+        #region Private classes
+
+        /// <summary>
+        /// Wraps all the info required to properly manage a task life cycle.
+        /// </summary>
+        private sealed class TaskInfo : IDisposable
+        {
+            private volatile bool _isTaskDisposed = false;
+            private volatile bool _isActiveContextDisposed = false;
+            private volatile bool _isDisposed = false;
+
+            /// <summary>
+            /// Constructor.
+            /// </summary>
+            /// <param name="config">The initial configuration for the task</param>
+            /// <param name="context">The active context for the task</param>
+            /// <param name="evaluatorId">The evaluator id</param>
+            /// <param name="status">The task status</param>
+            /// <param name="stages">The stages the task belongs to</param>
+            public TaskInfo(
+                IConfiguration config,
+                IActiveContext context,
+                string evaluatorId,
+                TaskState status,
+                IList stages)
+            {
+                TaskConfiguration = config;
+                ActiveContext = context;
+                EvaluatorId = evaluatorId;
+                Stages = stages;
+                TaskStatus = status;
+            }
+
+            /// <summary>
+            /// The task configuration.
+            /// </summary>
+            public IConfiguration TaskConfiguration { get; private set; }
+
+            /// <summary>
+            /// The active context for the task.
+            /// </summary>
+            public IActiveContext ActiveContext { get; private set; }
+
+            /// <summary>
+            /// Whether the active task context was previously disposed or not.
+            /// </summary>
+            public bool IsActiveContextDisposed
+            {
+                get { return _isActiveContextDisposed; }
+            }
+
+            /// <summary>
+            /// The id of the evaluator of the task.
+            /// </summary>
+            public string EvaluatorId { get; private set; }
+
+            /// <summary>
+            /// The stages the task will be executing.
+            /// </summary>
+            public IList Stages { get; private set; }
+
+            /// <summary>
+            /// Configurations when the task will be rescheduled after a failure.
+            /// </summary>
+            // NOTE(review): the generic type arguments of this declaration are missing in this
+            // copy of the file - restore them from the original source.
+            public Dictionary> RescheduleConfigurations =
+                new Dictionary>();
+
+            /// <summary>
+            /// Reference to the remote running task.
+            /// </summary>
+            public IRunningTask TaskRunner { get; private set; }
+
+            /// <summary>
+            /// The current status of the task.
+            /// </summary>
+            public TaskState TaskStatus { get; private set; }
+
+            /// <summary>
+            /// How many times the task has been scheduled.
+            /// </summary>
+            public int NumRetry = 1;
+
+            /// <summary>
+            /// Save the reference to the remote running task and mark the task as not disposed.
+            /// </summary>
+            /// <param name="taskRunner">The reference to the remote running task</param>
+            /// <exception cref="IllegalStateException">If the task info was already disposed</exception>
+            public void SetTaskRunner(IRunningTask taskRunner)
+            {
+                if (_isDisposed)
+                {
+                    throw new IllegalStateException("Cannot set task runner for a disposed task.");
+                }
+
+                TaskRunner = taskRunner;
+                _isTaskDisposed = false;
+            }
+
+            /// <summary>
+            /// Change the status of the task.
+            /// </summary>
+            /// <param name="status">The new task state</param>
+            public void SetTaskStatus(TaskState status)
+            {
+                TaskStatus = status;
+            }
+
+            /// <summary>
+            /// Update the task runtime. The previous active context must have been disposed
+            /// or dropped first.
+            /// </summary>
+            /// <param name="newActiveContext">The active context of the task</param>
+            /// <param name="evaluatorId">The id of the evaluator</param>
+            /// <exception cref="IllegalStateException">If the task info is disposed or the current
+            /// active context is not</exception>
+            public void UpdateRuntime(IActiveContext newActiveContext, string evaluatorId)
+            {
+                if (_isDisposed)
+                {
+                    throw new IllegalStateException("Cannot update runtime for a disposed task.");
+                }
+                if (!_isActiveContextDisposed)
+                {
+                    throw new IllegalStateException("Updating Task with not disposed active context.");
+                }
+
+                ActiveContext = newActiveContext;
+                EvaluatorId = evaluatorId;
+                _isActiveContextDisposed = false;
+            }
+
+            /// <summary>
+            /// Set the task runtime as disposed, without disposing the task or the context.
+            /// </summary>
+            public void DropRuntime()
+            {
+                _isActiveContextDisposed = true;
+                _isTaskDisposed = true;
+            }
+
+            /// <summary>
+            /// Dispose the running task, unless it is already marked as disposed.
+            /// </summary>
+            public void DisposeTask()
+            {
+                if (!_isTaskDisposed)
+                {
+                    _isTaskDisposed = true;
+
+                    TaskRunner?.Dispose();
+                }
+            }
+
+            /// <summary>
+            /// Dispose the active context of the task, unless it is already marked as disposed.
+            /// </summary>
+            public void DisposeActiveContext()
+            {
+                if (!_isActiveContextDisposed)
+                {
+                    _isActiveContextDisposed = true;
+
+                    ActiveContext?.Dispose();
+                }
+            }
+
+            /// <summary>
+            /// Dispose the task info: first the task, then its active context.
+            /// </summary>
+            public void Dispose()
+            {
+                if (!_isDisposed)
+                {
+                    _isDisposed = true;
+
+                    DisposeTask();
+
+                    DisposeActiveContext();
+                }
+            }
+        }
+
+        /// <summary>
+        /// Represent an event triggered by some timeout registered by the task set.
+        /// </summary>
+        private sealed class TasksetAlarm : Alarm
+        {
+            /// <summary>
+            /// Constructor.
+            /// </summary>
+            /// <param name="timestamp">The timestamp when the alarm should be triggered</param>
+            /// <param name="handler">The handler of the event triggered by the alarm</param>
+            public TasksetAlarm(long timestamp, IObserver handler) : base(timestamp, handler)
+            {
+            }
+        }
+
+        /// <summary>
+        /// Class used to define a timeout on the task set triggering an alarm.
+        /// </summary>
+        private sealed class TaskSetTimeout : ITimeout
+        {
+            private readonly IObserver _handler;
+            private readonly long _offset;
+
+            /// <summary>
+            /// Constructor.
+            /// </summary>
+            /// <param name="offset">The offset used to define when the timeout will be triggered</param>
+            /// <param name="handler">The handler for the alarm</param>
+            /// <exception cref="ArgumentNullException">If the handler is null</exception>
+            public TaskSetTimeout(long offset, IObserver handler)
+            {
+                _handler = handler ?? throw new ArgumentNullException(nameof(handler));
+                _offset = offset;
+            }
+
+            /// <summary>
+            /// Get the actual alarm to be scheduled.
+            /// </summary>
+            /// <param name="time">The current time</param>
+            /// <returns>An alarm set to the current time plus the offset</returns>
+            public Alarm GetAlarm(long time)
+            {
+                return new TasksetAlarm(time + _offset, _handler);
+            }
+        }
+
+        #endregion Private classes
+
+        private static readonly Logger Log = Logger.GetLogger(typeof(DefaultElasticTaskSetManager));
+
+        private bool _finalized = false;
+        private volatile bool _disposed = false;
+        private volatile bool _scheduled = false;
+        private volatile bool _completed = false;
+        private readonly DefaultElasticTaskSetManagerParameters _parameters;
+
+        private volatile int _contextsAdded = 0;
+        private int _tasksAdded = 0;
+        private int _tasksRunning = 0;
+        private volatile int _totFailedTasks = 0;
+        private volatile int _totFailedEvaluators = 0;
+
+        private readonly int _numTasks;
+        private readonly IEvaluatorRequestor _evaluatorRequestor;
+        private readonly string _driverId;
+        private readonly TaskConfigurator _masterTaskConfiguration;
+        private readonly TaskConfigurator _slaveTaskConfiguration;
+
+        // Task info 0-indexed
+        private readonly TaskInfo[] _taskInfos;
+
+        private readonly Dictionary _stages = new Dictionary();
+        private readonly ConcurrentQueue _queuedTasks = new ConcurrentQueue();
+        private readonly ConcurrentQueue _queuedContexts = new ConcurrentQueue();
+
+        // Used both for knowing which evaluator the task set is responsible for and to
+        // maintain a mapping between evaluators and contexts.
+        // The latter is necessary because evaluators may fail between context init
+        // and the time when the context is installed on the evaluator
+        private readonly ConcurrentDictionary _evaluatorToContextIdMapping =
+            new ConcurrentDictionary();
+
+        private IFailureState _failureStatus = new DefaultFailureState();
+        private volatile bool _hasProgress = true;
+
+        private readonly object _statusLock = new object();
+
+        /// <summary>
+        /// Constructor for the task set manager.
+        /// </summary>
+        /// <param name="numTasks">The total number of tasks in the task set</param>
+        /// <param name="evaluatorRequestor">The requestor to spawn new evaluators</param>
+        /// <param name="driverId">The id of the driver</param>
+        /// <param name="masterTaskConfiguration">The configuration for the master task</param>
+        /// <param name="slaveTaskConfiguration">The configuration for the slave tasks; when null
+        /// the master configuration is used</param>
+        /// <param name="confs">Additional configurations</param>
+        public DefaultElasticTaskSetManager(
+            int numTasks,
+            IEvaluatorRequestor evaluatorRequestor,
+            string driverId,
+            TaskConfigurator masterTaskConfiguration,
+            TaskConfigurator slaveTaskConfiguration = null,
+            params IConfiguration[] confs)
+        {
+            _numTasks = numTasks;
+            _evaluatorRequestor = evaluatorRequestor;
+            _driverId = driverId;
+            _masterTaskConfiguration = masterTaskConfiguration;
+            _slaveTaskConfiguration = slaveTaskConfiguration ?? masterTaskConfiguration;
+
+            _taskInfos = new TaskInfo[numTasks];
+
+            var injector = TangFactory.GetTang().NewInjector(confs);
+            _parameters = injector.GetInstance();
+
+            // Set up the timeout: a null message list tells OnTimeout this is the initial call.
+            List msgs = null;
+            var nextTimeouts = new List();
+
+            OnTimeout(new TasksetAlarm(0, this), ref msgs, ref nextTimeouts);
+        }
+
+        ///
+        /// An identifier for the set of stages the task manager is subscribed to.
+        /// The task set has to be built before retrieving its stages id.
+        ///
+        public string StagesId
+        {
+            get
+            {
+                if (_finalized)
+                {
+                    return string.Join("+", _stages.Keys);
+                }
+
+                throw new IllegalStateException("Task set have to be built before getting its stages");
+            }
+        }
+
+        /// <summary>
+        /// Decides whether more contexts have to be added to this task manager or not.
+        /// </summary>
+        /// <returns>True if the number of added contexts is less than the available slots</returns>
+        public bool HasMoreContextToAdd => _contextsAdded < _numTasks;
+
+        /// <summary>
+        /// Whether this task set is done, that is completed and with no task still running.
+        /// </summary>
+        public bool IsCompleted => Completed() && _tasksRunning == 0;
+
+        /// <summary>
+        /// Subscribe the current task set manager to a new stage.
+        /// </summary>
+        /// <param name="stage">The stage to subscribe to</param>
+        /// <returns>The same task set manager</returns>
+        /// <exception cref="IllegalStateException">If the task set manager was already built</exception>
+        public IElasticTaskSetManager AddStage(IElasticStage stage)
+        {
+            if (!_finalized)
+            {
+                _stages.Add(stage.StageName, stage);
+
+                return this;
+            }
+
+            throw new IllegalStateException("Cannot add stage to an already built task set manager.");
+        }
+
+        ///
+        /// Method used to generate unique context ids.
+        ///
+        /// <param name="evaluator">The evaluator the context will run on</param>
+        /// <param name="identifier">A new unique context id, or an empty string when no more
+        /// contexts can be scheduled</param>
+        /// <returns>True if a new context id is successfully created</returns>
+        public bool TryGetNextTaskContextId(IAllocatedEvaluator evaluator, out string identifier)
+        {
+            int id;
+            ContextInfo cinfo;
+
+            // Ids of queued tasks are reused first.
+            if (_queuedTasks.TryDequeue(out id))
+            {
+                identifier = Utils.BuildContextId(StagesId, id);
+                cinfo = new ContextInfo(id);
+                _evaluatorToContextIdMapping.TryAdd(evaluator.Id, cinfo);
+                return true;
+            }
+
+            // Then contexts queued for re-submission, which keep their retry count.
+            if (_queuedContexts.TryDequeue(out cinfo))
+            {
+                identifier = Utils.BuildContextId(StagesId, cinfo.Id);
+                _evaluatorToContextIdMapping.TryAdd(evaluator.Id, cinfo);
+                return true;
+            }
+
+            id = Interlocked.Increment(ref _contextsAdded);
+
+            // Test the value returned by the atomic increment: re-reading the shared counter
+            // could observe increments of other threads and reject an id that is still valid.
+            if (id > _numTasks)
+            {
+                Log.Log(Level.Warning, "Trying to schedule too many contexts");
+                identifier = string.Empty;
+                return false;
+            }
+
+            identifier = Utils.BuildContextId(StagesId, id);
+            cinfo = new ContextInfo(id);
+            _evaluatorToContextIdMapping.TryAdd(evaluator.Id, cinfo);
+
+            Log.Log(Level.Info, "Evaluator {0} is scheduled on node {1}",
+                evaluator.Id,
+                evaluator.GetEvaluatorDescriptor().NodeDescriptor.HostName);
+
+            return true;
+        }
+
+        /// <summary>
+        /// Method used to generate unique task ids.
+        /// The task id is built from the stages id and the number of the context.
+        /// </summary>
+        /// <param name="context">The context the task will run on</param>
+        /// <returns>A new task id</returns>
+        public string GetTaskId(IActiveContext context)
+        {
+            var id = Utils.GetContextNum(context);
+            return Utils.BuildTaskId(StagesId, id);
+        }
+
+        /// <summary>
+        /// Retrieve all stages having the context passed as a parameter as master task context.
+        /// </summary>
+        /// <param name="activeContext">The target context</param>
+        /// <returns>A list of stages having the master task running on context</returns>
+        public IEnumerable IsMasterTaskContext(IActiveContext activeContext)
+        {
+            return _stages.Values.Where(stage => stage.IsMasterTaskContext(activeContext));
+        }
+
+        ///
+        /// Get the configuration of the codecs used for data transmission.
+        /// The codecs are automatically generated from the operator pipeline.
+        ///
+        /// <returns>A configuration object with the codecs for data transmission</returns>
+        public IConfiguration GetCodecConfiguration()
+        {
+            var codecConf = TangFactory.GetTang().NewConfigurationBuilder().Build();
+
+            // Each stage contributes the codecs of its own pipeline to the same configuration.
+            foreach (var subscribedStage in _stages.Values)
+            {
+                subscribedStage.PipelineRoot.GetCodecConfiguration(ref codecConf);
+            }
+
+            return codecConf;
+        }
+
+        /// <summary>
+        /// Method implementing how the task set manager should react when a new context is active.
+        /// A task that was already scheduled once is re-submitted on the new context; otherwise a
+        /// new task is configured and added.
+        /// </summary>
+        /// <param name="activeContext">The new active context</param>
+        /// <exception cref="IllegalStateException">If the task set was not built yet</exception>
+        public void OnNewActiveContext(IActiveContext activeContext)
+        {
+            if (!_finalized)
+            {
+                throw new IllegalStateException("Task set have to be finalized before adding tasks.");
+            }
+
+            if (Completed() || Failed())
+            {
+                Log.Log(Level.Warning, "Adding tasks to already completed task set: ignoring.");
+                activeContext.Dispose();
+                return;
+            }
+
+            _hasProgress = true;
+
+            var taskId = Utils.BuildTaskId(StagesId, Utils.GetContextNum(activeContext));
+            var slot = Utils.GetContextNum(activeContext) - 1;
+            var existing = _taskInfos[slot];
+
+            // We reschedule the task only if the context was active (the slot is not empty) and the
+            // task was actually scheduled at least once (its status is past TaskState.Init)
+            var alreadyScheduled = existing != null && existing.TaskStatus > TaskState.Init;
+
+            if (!alreadyScheduled)
+            {
+                bool isMaster = IsMasterTaskContext(activeContext).Any();
+
+                Log.Log(Level.Info, "Task {0} to be scheduled on {1}", taskId, activeContext.EvaluatorId);
+
+                List partialTaskConfs = new List
+                {
+                    isMaster ? _masterTaskConfiguration(taskId) : _slaveTaskConfiguration(taskId)
+                };
+
+                AddTask(taskId, activeContext, partialTaskConfs);
+                return;
+            }
+
+            Log.Log(Level.Info, "{0} already part of task set: going to directly submit it.", taskId);
+
+            lock (existing)
+            {
+                existing.UpdateRuntime(activeContext, activeContext.EvaluatorId);
+            }
+
+            SubmitTask(slot);
+        }
+
+        ///
+        /// Finalizes the task set manager.
+        /// After the task set has been finalized, no more stages can be added.
+        ///
+        /// <returns>The same finalized task set manager</returns>
+        /// <exception cref="IllegalStateException">If the task set manager was already built</exception>
+        public IElasticTaskSetManager Build()
+        {
+            if (!_finalized)
+            {
+                _finalized = true;
+
+                return this;
+            }
+
+            throw new IllegalStateException("Task set manager cannot be built more than once");
+        }
+
+        /// <summary>
+        /// Method implementing how the task set manager should react when a notification that a task is
+        /// running is received. Tasks not managed by this task set are ignored.
+        /// </summary>
+        /// <param name="task">The running task</param>
+        public void OnTaskRunning(IRunningTask task)
+        {
+            if (!IsTaskManagedBy(task.Id))
+            {
+                return;
+            }
+
+            var runningInfo = _taskInfos[Utils.GetTaskNum(task.Id) - 1];
+            _hasProgress = true;
+
+            lock (runningInfo)
+            {
+                runningInfo.SetTaskRunner(task);
+
+                if (Completed() || Failed())
+                {
+                    Log.Log(Level.Info, "Received running from task {0} but task set is completed "
+                        + "or failed: ignoring.", task.Id);
+                    runningInfo.Dispose();
+
+                    return;
+                }
+
+                if (!runningInfo.TaskStatus.IsRunnable())
+                {
+                    Log.Log(Level.Info, "Received running from task {0} which is not runnable: ignoring.",
+                        task.Id);
+                    runningInfo.Dispose();
+
+                    return;
+                }
+
+                // Nothing to do when the task is already accounted as running.
+                if (runningInfo.TaskStatus == TaskState.Running)
+                {
+                    return;
+                }
+
+                // A recovering task has to be added again to every stage.
+                if (runningInfo.TaskStatus == TaskState.Recovering)
+                {
+                    foreach (var subscribedStage in _stages.Values)
+                    {
+                        subscribedStage.AddTask(task.Id);
+                    }
+                }
+
+                runningInfo.SetTaskStatus(TaskState.Running);
+                Interlocked.Increment(ref _tasksRunning);
+            }
+        }
+
+        ///
+        /// Method implementing how the task set manager should react when a notification that a task
+        /// is completed is received.
+        ///
+        /// <param name="taskInfo">The completed task</param>
+        public void OnTaskCompleted(ICompletedTask taskInfo)
+        {
+            if (IsTaskManagedBy(taskInfo.Id))
+            {
+                Interlocked.Decrement(ref _tasksRunning);
+                var id = Utils.GetTaskNum(taskInfo.Id) - 1;
+                _hasProgress = true;
+
+                lock (_taskInfos[id])
+                {
+                    _taskInfos[id].SetTaskStatus(TaskState.Completed);
+                }
+
+                // Once the task set is completed, dispose every task whose status is below Failed.
+                if (Completed())
+                {
+                    foreach (var info in _taskInfos.Where(candidate => candidate?.TaskStatus < TaskState.Failed))
+                    {
+                        info.DisposeTask();
+                    }
+                }
+            }
+        }
+
+        /// <summary>
+        /// Method implementing how the task set manager should react when a task message is received.
+        /// The message is offered to every stage and the replies are sent back to the tasks. If a stage
+        /// throws an IllegalStateException the task set is failed for that task; replies collected
+        /// before the exception are still sent.
+        /// </summary>
+        /// <param name="message">A message from a task</param>
+        public void OnTaskMessage(ITaskMessage message)
+        {
+            if (IsTaskManagedBy(message.TaskId))
+            {
+                var returnMessages = new List();
+                _hasProgress = true;
+
+                try
+                {
+                    foreach (var stage in _stages.Values)
+                    {
+                        returnMessages.AddRange(stage.OnTaskMessage(message));
+                    }
+                }
+                catch (IllegalStateException e)
+                {
+                    Log.Log(Level.Error, e.Message, e);
+                    Fail(message.TaskId);
+                }
+
+                SendToTasks(returnMessages);
+            }
+        }
+
+        #region Failure Response
+
+        /// <summary>
+        /// Used to react on a task failure.
+        /// Starts from an empty list of failure events and delegates to the two-argument overload.
+        /// </summary>
+        /// <param name="task">The failed task</param>
+        public void OnTaskFailure(IFailedTask task)
+        {
+            var failureEvents = new List();
+
+            OnTaskFailure(task, ref failureEvents);
+        }
+
+        ///
+        /// Used to react when a timeout event is triggered.
+        ///
+        /// <param name="alarm">The alarm triggering the timeout</param>
+        /// <param name="msgs">A list of messages encoding how remote tasks need to react; null marks
+        /// the initial call that sets up the timeouts</param>
+        /// <param name="nextTimeouts">The next timeouts to be scheduled</param>
+        public void OnTimeout(Alarm alarm, ref List msgs, ref List nextTimeouts)
+        {
+            if (msgs == null)
+            {
+                // Taskset is just started, init the timeouts
+                _hasProgress = false;
+                Log.Log(Level.Info, "Timeout alarm for task set initialized");
+                nextTimeouts.Add(new TaskSetTimeout(_parameters.Timeout, this));
+
+                foreach (var subscribedStage in _stages.Values)
+                {
+                    subscribedStage.OnTimeout(alarm, ref msgs, ref nextTimeouts);
+                }
+            }
+            else if (alarm.GetType() != typeof(TasksetAlarm))
+            {
+                // Not an alarm of the task set: let the stages react and deliver their messages.
+                foreach (var subscribedStage in _stages.Values)
+                {
+                    subscribedStage.OnTimeout(alarm, ref msgs, ref nextTimeouts);
+                }
+
+                SendToTasks(msgs);
+            }
+            else if (_hasProgress)
+            {
+                // Progress was made since the last alarm: reset the flag and re-arm the timeout.
+                _hasProgress = false;
+                nextTimeouts.Add(new TaskSetTimeout(_parameters.Timeout, this));
+            }
+            else if (Completed() || Failed())
+            {
+                Log.Log(Level.Warning, "Taskset made no progress in the last {0}ms. Forcing Disposal.",
+                    _parameters.Timeout);
+                Dispose();
+            }
+            else
+            {
+                Log.Log(Level.Error, "Taskset made no progress in the last {0}ms. Aborting.",
+                    _parameters.Timeout);
+                Fail();
+                return;
+            }
+
+            foreach (var pending in nextTimeouts)
+            {
+                _parameters.Clock.ScheduleAlarm(pending);
+            }
+        }
+
+        ///
+        /// Used to react on a failure occurred on a task.
+        /// It gets a failed task as input and in response it produces zero or more failure events.
+        ///
+        /// <param name="task">The failed task</param>
+        /// <param name="failureEvents">A list of events encoding the type of actions to be triggered
+        /// so far; a new list is created when null</param>
+        /// <exception cref="Exception">If the task failure cannot be properly handled</exception>
+        public void OnTaskFailure(IFailedTask task, ref List failureEvents)
+        {
+            if (IsTaskManagedBy(task.Id))
+            {
+                Log.Log(Level.Info, "Received a failure from {0}", task.Id, task.AsError());
+
+                Interlocked.Decrement(ref _tasksRunning);
+
+                // "++" on a shared counter is a non-atomic read-modify-write: count atomically.
+                Interlocked.Increment(ref _totFailedTasks);
+                _hasProgress = true;
+                var id = Utils.GetTaskNum(task.Id) - 1;
+
+                if (Completed() || Failed())
+                {
+                    Log.Log(Level.Info, "Received a failure from task {0} but the task set is completed or "
+                        + "failed: ignoring the failure", task.Id, task.AsError());
+
+                    lock (_taskInfos[id])
+                    {
+                        _taskInfos[id].SetTaskStatus(TaskState.Failed);
+                    }
+
+                    _taskInfos[id].Dispose();
+
+                    return;
+                }
+
+                failureEvents = failureEvents ?? new List();
+
+                lock (_taskInfos[id])
+                {
+                    if (_taskInfos[id].TaskStatus < TaskState.Failed)
+                    {
+                        _taskInfos[id].SetTaskStatus(TaskState.Failed);
+                    }
+
+                    try
+                    {
+                        foreach (var stage in _taskInfos[id].Stages)
+                        {
+                            stage.OnTaskFailure(task, ref failureEvents);
+                        }
+                    }
+                    catch (Exception e)
+                    {
+                        Log.Log(Level.Error, e.Message, e);
+                        Fail(task.Id);
+                    }
+
+                    // Failures have to be propagated up to the context
+                    _taskInfos[id].Stages.First().Context.OnTaskFailure(task, ref failureEvents);
+                }
+
+                // Index loop on purpose: each event is passed by reference to the dispatcher.
+                for (int i = 0; i < failureEvents.Count; i++)
+                {
+                    var @event = failureEvents[i];
+                    EventDispatcher(ref @event);
+                }
+            }
+        }
+
+        ///
+        /// Used to react of a failure event occurred on an evaluator.
+        ///
+        /// <param name="evaluator">The failed evaluator</param>
+        public void OnEvaluatorFailure(IFailedEvaluator evaluator)
+        {
+            Log.Log(Level.Info, "Received a failure from {0}", evaluator.Id, evaluator.EvaluatorException);
+
+            // "++" on a shared counter is a non-atomic read-modify-write: count atomically.
+            Interlocked.Increment(ref _totFailedEvaluators);
+
+            if (evaluator.FailedTask.IsPresent())
+            {
+                // A task was running on the evaluator: handle it as a task failure.
+                var failedTask = evaluator.FailedTask.Value;
+                var id = Utils.GetTaskNum(failedTask.Id) - 1;
+
+                lock (_taskInfos[id])
+                {
+                    _taskInfos[id].DropRuntime();
+                }
+
+                OnTaskFailure(failedTask);
+                _evaluatorToContextIdMapping.TryRemove(evaluator.Id, out _);
+                return;
+            }
+
+            _hasProgress = true;
+
+            if (Completed() || Failed())
+            {
+                return;
+            }
+
+            if (!_evaluatorToContextIdMapping.TryRemove(evaluator.Id, out ContextInfo cinfo))
+            {
+                // No context is registered for this evaluator, so there is nothing to re-schedule.
+                // Spawning here would request an evaluator for the default context id 0.
+                return;
+            }
+
+            // Context ids are 1-based while task infos are 0-indexed.
+            var taskInfo = _taskInfos[cinfo.Id - 1];
+
+            if (taskInfo != null)
+            {
+                lock (taskInfo)
+                {
+                    taskInfo.DropRuntime();
+                    taskInfo.SetTaskStatus(TaskState.Failed);
+                }
+            }
+
+            cinfo.NumRetry++;
+
+            if (cinfo.NumRetry > _parameters.NumEvaluatorFailures)
+            {
+                Log.Log(Level.Error, "Context {0} failed more than {1} times: Aborting",
+                    cinfo.Id,
+                    _parameters.NumEvaluatorFailures);
+                Fail();
+
+                // The task set is aborted: do not queue the context or request a new evaluator.
+                return;
+            }
+
+            _queuedContexts.Enqueue(cinfo);
+            SpawnNewEvaluator(cinfo.Id);
+        }
+
+        ///
+        /// When a new failure state is reached, this method is used to dispatch
+        /// such event to the proper failure mitigation logic.
+        /// It gets a failure event as input and produces zero or more failure response messages
+        /// for tasks (appended into the event).
+        /// </summary>
+        /// <param name="event">The failure event to react upon</param>
+        /// <exception cref="IllegalStateException">If the failure event kind is not recognized</exception>
+        public void EventDispatcher(ref IFailureEvent @event)
+        {
+            // NOTE(review): Fail() is declared with a default task id of ""; whether Utils.GetTaskNum
+            // accepts an empty id is not visible here - confirm.
+            var id = Utils.GetTaskNum(@event.TaskId) - 1;
+            var stages = _taskInfos[id].Stages;
+
+            // The context sees the event first, then every stage of the task.
+            stages.First().Context.EventDispatcher(ref @event);
+
+            foreach (var stage in stages)
+            {
+                stage.EventDispatcher(ref @event);
+            }
+
+            // Direct casts: an event whose declared kind does not match its runtime type fails here
+            // with an InvalidCastException instead of a NullReferenceException inside the handler.
+            switch ((DefaultFailureStateEvents)@event.FailureEvent)
+            {
+                case DefaultFailureStateEvents.Reconfigure:
+                    var rec = (ReconfigureEvent)@event;
+                    OnReconfigure(ref rec);
+                    break;
+
+                case DefaultFailureStateEvents.Reschedule:
+                    var res = (RescheduleEvent)@event;
+                    OnReschedule(ref res);
+                    break;
+
+                case DefaultFailureStateEvents.Stop:
+                    var stp = (StopEvent)@event;
+                    OnStop(ref stp);
+                    break;
+
+                case DefaultFailureStateEvents.Fail:
+                    OnFail();
+                    break;
+
+                default:
+                    throw new IllegalStateException("Failure event not recognized.");
+            }
+        }
+
+        /// <summary>
+        /// Mechanism to execute when a reconfigure event is triggered.
+        /// Merges ContinueAndReconfigure into the failure status and sends the failure response to the tasks.
+        /// </summary>
+        /// <param name="reconfigureEvent">The reconfigure event carrying the failure response messages</param>
+        public void OnReconfigure(ref ReconfigureEvent reconfigureEvent)
+        {
+            lock (_statusLock)
+            {
+                _failureStatus = _failureStatus.Merge(
+                    new DefaultFailureState((int)DefaultFailureStates.ContinueAndReconfigure));
+            }
+
+            SendToTasks(reconfigureEvent.FailureResponse);
+        }
+
+        /// <summary>
+        /// Mechanism to execute when a reschedule event is triggered.
+        /// Merges ContinueAndReschedule into the failure status, sends the failure response to the
+        /// tasks and then reschedules the task named by the event.
+        /// </summary>
+        /// <param name="rescheduleEvent">The reschedule event carrying the failure response messages</param>
+        public void OnReschedule(ref RescheduleEvent rescheduleEvent)
+        {
+            lock (_statusLock)
+            {
+                _failureStatus = _failureStatus.Merge(
+                    new DefaultFailureState((int)DefaultFailureStates.ContinueAndReschedule));
+            }
+
+            SendToTasks(rescheduleEvent.FailureResponse);
+
+            Reschedule(rescheduleEvent);
+        }
+
+        ///
+        /// Mechanism to execute when a stop event is triggered.
+        ///
+        /// <param name="stopEvent">The stop event: its failure response is sent to the tasks, then it is handed to Reschedule</param>
+        public void OnStop(ref StopEvent stopEvent)
+        {
+            lock (_statusLock)
+            {
+                _failureStatus = _failureStatus.Merge(
+                    new DefaultFailureState((int)DefaultFailureStates.StopAndReschedule));
+            }
+
+            SendToTasks(stopEvent.FailureResponse);
+
+            var rescheduleEvent = stopEvent as RescheduleEvent;
+
+            Reschedule(rescheduleEvent);
+        }
+
+        /// <summary>
+        /// Mechanism to execute when a fail event is triggered: merges Fail into the failure status and disposes the task set.
+        /// </summary>
+        public void OnFail()
+        {
+            Log.Log(Level.Info, "Task set failed");
+
+            lock (_statusLock)
+            {
+                _failureStatus = _failureStatus.Merge(new DefaultFailureState((int)DefaultFailureStates.Fail));
+            }
+
+            Dispose();
+        }
+
+        #endregion Failure Response
+
+        public void Dispose()
+        {
+            if (!_disposed)
+            {
+                _disposed = true;
+                LogFinalStatistics();
+
+                foreach (var info in _taskInfos)
+                {
+                    if (info != null)
+                    {
+                        lock (info)
+                        {
+                            info.Dispose();
+                        }
+                    }
+                }
+            }
+        }
+
+        /// <summary>
+        /// Whether the input task is managed by this task set manager, i.e. its stages id equals StagesId.
+        /// </summary>
+        /// <param name="id">The task identifier</param>
+        public bool IsTaskManagedBy(string id)
+        {
+            return Utils.GetTaskStages(id) == StagesId;
+        }
+
+        /// <summary>
+        /// Whether the input context is managed by this task set manager, i.e. its stages id equals StagesId.
+        /// </summary>
+        /// <param name="id">The context identifier</param>
+        public bool IsContextManagedBy(string id)
+        {
+            return Utils.GetContextStages(id) == StagesId;
+        }
+
+        /// <summary>
+        /// Whether the input evaluator is managed by this task set manager, i.e. it has a registered context.
+        /// </summary>
+        /// <param name="id">The evaluator identifier</param>
+        public bool IsEvaluatorManagedBy(string id)
+        {
+            return _evaluatorToContextIdMapping.ContainsKey(id);
+        }
+
+        ///
+        /// Observer reacting to an alarm event.
+        ///
+        /// <param name="alarm">The alarm</param>
+        public void OnNext(Alarm alarm)
+        {
+            // NOTE(review): the generic type arguments of these two lists are missing from the
+            // reviewed text; they have to match the ref parameters of OnTimeout.
+            var msgs = new List();
+            var nextTimeouts = new List();
+
+            OnTimeout(alarm, ref msgs, ref nextTimeouts);
+        }
+
+        /// <summary>
+        /// Observer callback for errors: does nothing.
+        /// </summary>
+        /// <param name="error">The error (unused)</param>
+        public void OnError(Exception error)
+        {
+        }
+
+        /// <summary>
+        /// Observer callback for completion: does nothing.
+        /// </summary>
+        public void OnCompleted()
+        {
+        }
+
+        /// <summary>
+        /// Registers a task with every stage, records its <c>TaskInfo</c> and submits it if the
+        /// task set is already scheduled (or becomes schedulable with this task).
+        /// </summary>
+        /// <param name="taskId">The identifier of the task to add</param>
+        /// <param name="activeContext">The context the task will run on; disposed if a stage rejects the task</param>
+        /// <param name="partialTaskConfigs">The task configurations collected so far; partition configurations
+        /// are appended to this list before everything is merged</param>
+        private void AddTask(string taskId, IActiveContext activeContext, List<IConfiguration> partialTaskConfigs)
+        {
+            Interlocked.Increment(ref _tasksAdded);
+            var stageList = new List<IElasticStage>();
+            var id = Utils.GetTaskNum(taskId) - 1;
+
+            foreach (var stage in _stages)
+            {
+                if (!stage.Value.AddTask(taskId))
+                {
+                    // NOTE(review): _tasksAdded was already incremented and the stages visited so far
+                    // keep the task - confirm that no roll-back is needed here.
+                    Log.Log(Level.Warning, "{0} cannot be added to stage {1}", taskId, stage.Key);
+                    activeContext.Dispose();
+                    return;
+                }
+
+                stageList.Add(stage.Value);
+                var partitionConf = stage.Value.GetPartitionConf(taskId);
+
+                if (partitionConf.IsPresent())
+                {
+                    partialTaskConfigs.Add(partitionConf.Value);
+                }
+            }
+
+            // Aggregate without a seed throws on an empty list: callers must pass at least one configuration.
+            var aggregatedConfs = partialTaskConfigs.Aggregate((x, y) => Configurations.Merge(x, y));
+
+            _taskInfos[id] = new TaskInfo(
+                aggregatedConfs,
+                activeContext,
+                activeContext.EvaluatorId,
+                TaskState.Init,
+                stageList);
+
+            if (_scheduled)
+            {
+                SubmitTask(id);
+            }
+            else if (StartSubmitTasks())
+            {
+                SubmitTasks();
+            }
+        }
+
+        /// <summary>
+        /// Moves the task set to the scheduled state once every stage agrees to be scheduled.
+        /// </summary>
+        /// <returns>True only for the single call that performs the transition to scheduled</returns>
+        private bool StartSubmitTasks()
+        {
+            lock (_statusLock)
+            {
+                if (_scheduled)
+                {
+                    return false;
+                }
+
+                if (!_stages.All(stage => stage.Value.ScheduleStage()))
+                {
+                    return false;
+                }
+
+                _scheduled = true;
+
+                Log.Log(Level.Info, "Scheduling {0} tasks from Taskset {1}", _tasksAdded, StagesId);
+
+                // Decide inside the lock: reading _scheduled after releasing it would let a caller
+                // that lost the race also report true and submit all tasks a second time.
+                return true;
+            }
+        }
+
+        /// <summary>
+        /// Submits every task that has been added so far.
+        /// </summary>
+        private void SubmitTasks()
+        {
+            for (int i = 0; i < _numTasks; i++)
+            {
+                if (_taskInfos[i] != null)
+                {
+                    SubmitTask(i);
+                }
+            }
+        }
+
+        /// <summary>
+        /// Builds the final configuration of a task and submits it on its active context.
+        /// If the context is no longer active, a previously failed task is queued and a new
+        /// evaluator is requested instead.
+        /// </summary>
+        /// <param name="id">The zero-based index of the task (task number minus one)</param>
+        private void SubmitTask(int id)
+        {
+            var taskInfo = _taskInfos[id];
+            if (Completed() || Failed())
+            {
+                Log.Log(Level.Warning, "Task submit for a completed or failed Task Set: ignoring.");
+                taskInfo.DisposeTask();
+
+                return;
+            }
+
+            lock (taskInfo)
+            {
+                // Check that the task was not already submitted. This may happen for instance if
+                // _scheduled is set to true and a new active context message is received.
+                if (taskInfo.TaskStatus == TaskState.Submitted)
+                {
+                    return;
+                }
+
+                var stages = taskInfo.Stages;
+                ICsConfigurationBuilder confBuilder = TangFactory.GetTang().NewConfigurationBuilder();
+                var rescheduleConfs = taskInfo.RescheduleConfigurations;
+
+                foreach (var stage in stages)
+                {
+                    // Stages are addressed with the one-based task number.
+                    var confSub = stage.GetTaskConfiguration(id + 1);
+
+                    if (rescheduleConfs.TryGetValue(stage.StageName, out var confs))
+                    {
+                        foreach (var additionalConf in confs)
+                        {
+                            confSub = Configurations.Merge(confSub, additionalConf);
+                        }
+                    }
+
+                    _stages.Values.First().Context.SerializeStageConfiguration(ref confBuilder, confSub);
+                }
+
+                // NOTE(review): the generic arguments of BindNamedParameter / GenericType are missing
+                // from the reviewed text.
+                IConfiguration baseConf = confBuilder
+                    .BindNamedParameter(
+                        GenericType.Class,
+                        _driverId)
+                    .Build();
+
+                IConfiguration mergedTaskConf = Configurations.Merge(taskInfo.TaskConfiguration, baseConf);
+
+                if (taskInfo.IsActiveContextDisposed)
+                {
+                    Log.Log(Level.Warning,
+                        "Task submit for {0} with a non-active context: spawning a new evaluator.", id + 1);
+
+                    if (taskInfo.TaskStatus == TaskState.Failed)
+                    {
+                        _queuedTasks.Enqueue(id + 1);
+                        taskInfo.SetTaskStatus(TaskState.Queued);
+
+                        // One-based, like the queued id above and the id passed by OnEvaluatorFailure.
+                        SpawnNewEvaluator(id + 1);
+                    }
+
+                    return;
+                }
+
+                taskInfo.ActiveContext.SubmitTask(mergedTaskConf);
+                taskInfo.SetTaskStatus(
+                    taskInfo.TaskStatus.IsRecoverable() ?
+                    TaskState.Recovering :
+                    TaskState.Submitted);
+            }
+        }
+
+        /// <summary>
+        /// Sends each message to its destination task. A message for a task that is submitted but not
+        /// yet running is retried after <c>WaitTime</c>, up to <c>Retry</c> times; once the retries
+        /// are exhausted the task set is failed.
+        /// </summary>
+        /// <param name="messages">The messages to send; null entries are skipped.
+        /// NOTE(review): the generic type argument of this list is missing from the reviewed text</param>
+        /// <param name="retry">The number of attempts already made for these messages</param>
+        /// <exception cref="ArgumentNullException">If no task info is registered for a destination</exception>
+        private void SendToTasks(IList messages, int retry = 0)
+        {
+            foreach (var returnMessage in messages)
+            {
+                if (returnMessage == null)
+                {
+                    continue;
+                }
+
+                var destination = Utils.GetTaskNum(returnMessage.Destination) - 1;
+                var taskInfo = _taskInfos[destination] ?? throw new ArgumentNullException("Task Info");
+
+                lock (taskInfo)
+                {
+                    if (Completed() || Failed())
+                    {
+                        Log.Log(Level.Warning, "Task submit for a completed or failed Task Set: ignoring.");
+                        taskInfo.DisposeTask();
+
+                        continue;
+                    }
+
+                    if (taskInfo.TaskStatus != TaskState.Running ||
+                        taskInfo.TaskRunner == null)
+                    {
+                        var msg = $"Cannot send message to {destination + 1}: Task Status is {taskInfo.TaskStatus}:";
+
+                        if (taskInfo.TaskStatus == TaskState.Submitted && retry < _parameters.Retry)
+                        {
+                            Log.Log(Level.Warning, msg + " Retry");
+
+                            // Wait without blocking a thread-pool thread, then retry this message only.
+                            System.Threading.Tasks.Task.Run(async () =>
+                            {
+                                await System.Threading.Tasks.Task.Delay(_parameters.WaitTime).ConfigureAwait(false);
+                                SendToTasks(new[] { returnMessage }, retry + 1);
+                            });
+                        }
+                        else if (retry >= _parameters.Retry)
+                        {
+                            Log.Log(Level.Warning, msg + " Aborting");
+                            Fail(returnMessage.Destination);
+                        }
+                        else
+                        {
+                            Log.Log(Level.Warning, msg + " Ignoring");
+                        }
+
+                        continue;
+                    }
+
+                    taskInfo.TaskRunner.Send(returnMessage.Serialize());
+                }
+            }
+        }
+
+        /// <summary>
+        /// Requests one new evaluator using the memory, cores and rack name from the parameters.
+        /// </summary>
+        /// <param name="id">The id the evaluator is requested for; only used for logging</param>
+        private void SpawnNewEvaluator(int id)
+        {
+            Log.Log(Level.Warning, "Spawning new evaluator for id {0}", id);
+
+            // NOTE(review): _parameters.NewEvaluatorBatchId is injected but not applied to the
+            // request - confirm whether that is intended.
+            var request = _evaluatorRequestor.NewBuilder()
+                .SetNumber(1)
+                .SetMegabytes(_parameters.NewEvaluatorMemorySize)
+                .SetCores(_parameters.NewEvaluatorNumCores)
+                .SetRackName(_parameters.NewEvaluatorRackName)
+                .Build();
+
+            _evaluatorRequestor.Submit(request);
+        }
+
+        /// <summary>
+        /// Counts one more retry for the task named by the event and, if requested, resubmits it with
+        /// the reschedule configurations. The task set is failed when the task exceeded the supported
+        /// number of task failures.
+        /// </summary>
+        /// <param name="rescheduleEvent">The event naming the task to reschedule</param>
+        private void Reschedule(RescheduleEvent rescheduleEvent)
+        {
+            var id = Utils.GetTaskNum(rescheduleEvent.TaskId) - 1;
+            var taskInfo = _taskInfos[id];
+
+            lock (taskInfo)
+            {
+                taskInfo.NumRetry++;
+
+                if (taskInfo.NumRetry > _parameters.NumTaskFailures)
+                {
+                    Log.Log(Level.Error, "Task {0} failed more than {1} times: aborting",
+                        rescheduleEvent.TaskId,
+                        _parameters.NumTaskFailures);
+                    Fail(rescheduleEvent.TaskId);
+
+                    // The task set has just been failed: there is nothing left to resubmit.
+                    return;
+                }
+
+                if (rescheduleEvent.Reschedule)
+                {
+                    Log.Log(Level.Info, "Rescheduling task {0}", rescheduleEvent.TaskId);
+
+                    taskInfo.RescheduleConfigurations = rescheduleEvent.RescheduleTaskConfigurations;
+
+                    SubmitTask(id);
+                }
+            }
+        }
+
+        /// <summary>
+        /// Creates a fail event for the given task and dispatches it.
+        /// </summary>
+        /// <param name="taskId">The task triggering the failure (empty when not tied to a task)</param>
+        private void Fail(string taskId = "")
+        {
+            IFailureEvent @event = new FailEvent(taskId);
+
+            EventDispatcher(ref @event);
+        }
+
+        /// <summary>
+        /// Logs the failure counters and the final statistics of every stage, if Info is loggable.
+        /// </summary>
+        private void LogFinalStatistics()
+        {
+            if (Log.IsLoggable(Level.Info))
+            {
+                Log.Log(Level.Info, "Total Failed Tasks: {0}\nTotal Failed Evaluators: {1}\n{2}",
+                    _totFailedTasks,
+                    _totFailedEvaluators,
+                    string.Join("\n", _stages.Select(x => x.Value.LogFinalStatistics())));
+            }
+        }
+
+        /// <summary>
+        /// Whether all the stages are completed. A positive answer is cached in <c>_completed</c>.
+        /// </summary>
+        private bool Completed()
+        {
+            if (!_completed)
+            {
+                _completed = _stages.Values.All(stage => stage.IsCompleted);
+
+                if (_completed)
+                {
+                    Log.Log(Level.Info, "Task set completed.");
+                }
+            }
+
+            return _completed;
+        }
+
+        /// <summary>
+        /// Whether the current failure status of the task set is a fail state.
+        /// </summary>
+        private bool Failed()
+        {
+            return _failureStatus.FailureState.IsFail();
+        }
+    }
+}
\ No newline at end of file
diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticTaskSetManagerParameters.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticTaskSetManagerParameters.cs
new file mode 100644
index 0000000000..ec54652c41
--- /dev/null
+++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticTaskSetManagerParameters.cs
@@ -0,0 +1,109 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+using Org.Apache.REEF.Tang.Annotations;
+using Org.Apache.REEF.Network.Elastic.Config;
+using System.Threading.Tasks;
+using Org.Apache.REEF.Network.Elastic.Failures;
+using Org.Apache.REEF.Utilities.Attributes;
+
+namespace Org.Apache.REEF.Network.Elastic.Driver.Default
+{
+    /// <summary>
+    /// Injectable class containing all the parameters for the default task set manager. Constructing it also starts the clock on a long-running task.
+    /// </summary>
+    [Unstable("0.16", "API may change")]
+    internal sealed class DefaultElasticTaskSetManagerParameters
+    {
+        [Inject]
+        private DefaultElasticTaskSetManagerParameters(
+            FailuresClock clock,
+            [Parameter(typeof(ElasticServiceConfigurationOptions.Timeout))] int timeout,
+            [Parameter(typeof(ElasticServiceConfigurationOptions.SendRetry))] int retry,
+            [Parameter(typeof(ElasticServiceConfigurationOptions.RetryWaitTime))] int waitTime,
+            [Parameter(typeof(ElasticServiceConfigurationOptions.NumTaskFailures))] int numTaskFailures,
+            [Parameter(typeof(ElasticServiceConfigurationOptions.NumEvaluatorFailures))] int numEvaluatorFailures,
+            [Parameter(typeof(ElasticServiceConfigurationOptions.NewEvaluatorRackName))] string rackName,
+            [Parameter(typeof(ElasticServiceConfigurationOptions.NewEvaluatorBatchId))] string batchId,
+            [Parameter(typeof(ElasticServiceConfigurationOptions.NewEvaluatorNumCores))] int numCores,
+            [Parameter(typeof(ElasticServiceConfigurationOptions.NewEvaluatorMemorySize))] int memorySize)
+        {
+            Clock = clock;
+            Timeout = timeout;
+            Retry = retry;
+            WaitTime = waitTime;
+            NumTaskFailures = numTaskFailures;
+            NumEvaluatorFailures = numEvaluatorFailures;
+            NewEvaluatorRackName = rackName;
+            NewEvaluatorBatchId = batchId;
+            NewEvaluatorNumCores = numCores;
+            NewEvaluatorMemorySize = memorySize;
+
+            System.Threading.Tasks.Task.Factory.StartNew(Clock.Run, TaskCreationOptions.LongRunning);
+        }
+
+        /// <summary>
+        /// The clock for scheduling alarms.
+        /// </summary>
+        public FailuresClock Clock { get; }
+
+        /// <summary>
+        /// Timeout after which computation is considered inactive.
+        /// </summary>
+        public int Timeout { get; }
+
+        /// <summary>
+        /// How many times a message communication can be retried.
+        /// </summary>
+        public int Retry { get; }
+
+        /// <summary>
+        /// How much time to wait between message retries.
+        /// </summary>
+        public int WaitTime { get; }
+
+        /// <summary>
+        /// Supported number of task failures.
+        /// </summary>
+        public int NumTaskFailures { get; }
+
+        /// <summary>
+        /// Supported number of evaluator failures.
+        /// </summary>
+        public int NumEvaluatorFailures { get; }
+
+        /// <summary>
+        /// The rack name when spawning new evaluators.
+        /// </summary>
+        public string NewEvaluatorRackName { get; }
+
+        /// <summary>
+        /// The batch id when spawning new evaluators.
+        /// </summary>
+        public string NewEvaluatorBatchId { get; }
+
+        /// <summary>
+        /// Number of cores for new evaluators.
+        /// </summary>
+        public int NewEvaluatorNumCores { get; }
+
+        /// <summary>
+        /// Memory size for new evaluators.
+        /// </summary>
+        public int NewEvaluatorMemorySize { get; }
+    }
+}
diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/TaskState.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/TaskState.cs
new file mode 100644
index 0000000000..0e828337c3
--- /dev/null
+++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/TaskState.cs
@@ -0,0 +1,71 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+using System.Linq;
+
+namespace Org.Apache.REEF.Network.Elastic.Driver.Default
+{
+    /// <summary>
+    /// Definition of the different states in which a task can be. The numeric order is significant: the task set manager compares states with the less-than operator.
+    /// </summary>
+    internal enum TaskState
+    {
+        Init = 1,
+
+        Queued = 2,
+
+        Submitted = 3,
+
+        Recovering = 4,
+
+        Running = 5,
+
+        Failed = 6,
+
+        Completed = 7
+    }
+
+    /// <summary>
+    /// Utility class used to recognize particular task states.
+    /// </summary>
+    internal static class TaskStateUtils
+    {
+        private static readonly TaskState[] Recoverable = { TaskState.Failed, TaskState.Queued };
+
+        private static readonly TaskState[] NotRunnable = { TaskState.Failed, TaskState.Completed };
+
+        /// <summary>
+        /// Whether a task is recoverable or not (its state is Failed or Queued).
+        /// </summary>
+        /// <param name="state">The current state of the task</param>
+        /// <returns>True if the task is recoverable</returns>
+        public static bool IsRecoverable(this TaskState state)
+        {
+            return Recoverable.Contains(state);
+        }
+
+        /// <summary>
+        /// Whether a task can be run or not (its state is neither Failed nor Completed).
+        /// </summary>
+        /// <param name="state">The current state of the task</param>
+        /// <returns>True if the task can be run</returns>
+        public static bool IsRunnable(this TaskState state)
+        {
+            return !NotRunnable.Contains(state);
+        }
+    }
+}
diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/IElasticContext.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/IElasticContext.cs
new file mode 100644
index 0000000000..8b96905cbf
--- /dev/null
+++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/IElasticContext.cs
@@ -0,0 +1,136 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.
You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+using Org.Apache.REEF.Network.Elastic.Driver.Default;
+using Org.Apache.REEF.Network.Elastic.Failures;
+using Org.Apache.REEF.Tang.Annotations;
+using Org.Apache.REEF.Tang.Formats;
+using Org.Apache.REEF.Tang.Interface;
+using Org.Apache.REEF.Utilities.Attributes;
+using System.Collections.Generic;
+
+namespace Org.Apache.REEF.Network.Elastic.Driver
+{
+    /// <summary>
+    /// Delegate used to generate the task configuration for the input task.
+    /// </summary>
+    /// <param name="taskId">The identifier for the task</param>
+    /// <returns>The configuration for the task</returns>
+    public delegate IConfiguration TaskConfigurator(string taskId);
+
+    /// <summary>
+    /// This is the entry point for enabling the Elastic Group Communication.
+    /// The workflow is the following:
+    /// (1) Create a context instance;
+    /// (2) Use the context to create one or more stages;
+    /// (3) Use the stage to create a pipeline of operators representing the
+    /// communication pattern the tasks should implement;
+    /// (4) Create one or more task set managers to manage the scheduling of the tasks;
+    /// (5) Register stage to the manager to properly configure the task set.
+    ///
+    /// This interface is mainly used to create elastic stages.
+    /// Also manages configurations for elastic group communication operators/stages.
+    /// </summary>
+    [Unstable("0.16", "API may change")]
+    [DefaultImplementation(typeof(DefaultElasticContext))]
+    public interface IElasticContext : IFailureResponse
+    {
+        /// <summary>
+        /// Creates a stage with the default settings.
+        /// The stage lifecycle is managed by the context.
+        /// </summary>
+        /// <returns>A new stage with default parameters</returns>
+        IElasticStage DefaultStage();
+
+        /// <summary>
+        /// Creates a new stage.
+        /// The stage lifecycle is managed by the context.
+        /// </summary>
+        /// <param name="stageName">The name of the stage</param>
+        /// <param name="numTasks">The number of tasks required by the stage</param>
+        /// <param name="failureMachine">An optional failure machine governing the stage</param>
+        /// <returns>The new stage</returns>
+        IElasticStage CreateNewStage(string stageName, int numTasks, IFailureStateMachine failureMachine = null);
+
+        /// <summary>
+        /// Remove a stage from the context.
+        /// </summary>
+        /// <param name="stageName">The name of the stage to be removed</param>
+        void RemoveElasticStage(string stageName);
+
+        /// <summary>
+        /// Generate the base configuration module for tasks.
+        /// This method can be used to generate configurations for the task set manager.
+        /// </summary>
+        /// <param name="taskId">The id of the task the configuration is generated for</param>
+        /// <returns>The module with the service properly set up for the task</returns>
+        ConfigurationModule GetTaskConfigurationModule(string taskId);
+
+        /// <summary>
+        /// Start the elastic group communication context.
+        /// This will trigger requests for resources as specified by the parameters.
+        /// </summary>
+        void Start();
+
+        /// <summary>
+        /// Create a new task set manager.
+        /// </summary>
+        /// <param name="masterTaskConfiguration">The configuration for the master task</param>
+        /// <param name="slaveTaskConfiguration">The configuration for the slave task</param>
+        /// <returns>A new task set manager</returns>
+        IElasticTaskSetManager CreateNewTaskSetManager(
+            TaskConfigurator masterTaskConfiguration,
+            TaskConfigurator slaveTaskConfiguration = null);
+
+        /// <summary>
+        /// Create a new task set manager.
+        /// </summary>
+        /// <param name="numOfTasks">The number of tasks the task set should manage</param>
+        /// <param name="masterTaskConfiguration">The configuration for the master task</param>
+        /// <param name="slaveTaskConfiguration">The configuration for the slave task</param>
+        /// <returns>A new task set manager</returns>
+        IElasticTaskSetManager CreateNewTaskSetManager(
+            int numOfTasks,
+            TaskConfigurator masterTaskConfiguration,
+            TaskConfigurator slaveTaskConfiguration = null);
+
+        /// <summary>
+        /// Generate the elastic service configuration object.
+        /// This method is used to properly configure task contexts with the elastic service.
+        /// </summary>
+        /// <returns>The elastic service configuration</returns>
+        IConfiguration GetElasticServiceConfiguration();
+
+        #region Serialization Helpers
+        /// <summary>
+        /// Append a stage configuration to a configuration builder object.
+        /// </summary>
+        /// <param name="confBuilder">The configuration where the stage configuration will be appended to</param>
+        /// <param name="stageConf">The stage configuration at hand</param>
+        /// <remarks>The serialized stage configuration is made available through <paramref name="confBuilder"/>; nothing is returned.</remarks>
+        void SerializeStageConfiguration(ref ICsConfigurationBuilder confBuilder, IConfiguration stageConf);
+
+        /// <summary>
+        /// Append an operator configuration to a configuration builder object.
+        /// </summary>
+        /// <param name="serializedOperatorsConfs">The list where the operator configuration will be appended to</param>
+        /// <param name="operatorConfiguration">The operator configuration at hand</param>
+        /// <remarks>The serialized operator configuration is made available through <paramref name="serializedOperatorsConfs"/>; nothing is returned.</remarks>
+        void SerializeOperatorConfiguration(ref IList serializedOperatorsConfs, IConfiguration operatorConfiguration);
+        #endregion
+    }
+}
diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/IElasticStage.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/IElasticStage.cs
new file mode 100644
index 0000000000..a888330ffa
--- /dev/null
+++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/IElasticStage.cs
@@ -0,0 +1,149 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.
See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+using Org.Apache.REEF.Tang.Interface;
+using Org.Apache.REEF.Network.Elastic.Operators.Logical;
+using Org.Apache.REEF.Driver.Context;
+using Org.Apache.REEF.Network.Elastic.Failures;
+using Org.Apache.REEF.Utilities;
+using Org.Apache.REEF.IO.PartitionedData;
+using Org.Apache.REEF.Utilities.Attributes;
+using Org.Apache.REEF.Network.Elastic.Comm;
+
+namespace Org.Apache.REEF.Network.Elastic.Driver
+{
+    /// <summary>
+    /// Used to group elastic operators into logical units.
+    /// All operators in the same stage share similar semantics and behavior
+    /// under failures. Stages can only be created by a context.
+    /// </summary>
+    [Unstable("0.16", "API may change")]
+    public interface IElasticStage : IFailureResponse, ITaskMessageResponse
+    {
+        /// <summary>
+        /// The name of the stage.
+        /// </summary>
+        string StageName { get; }
+
+        /// <summary>
+        /// The operator at the beginning of the computation workflow.
+        /// </summary>
+        ElasticOperator PipelineRoot { get; }
+
+        /// <summary>
+        /// The failure state of the target stage.
+        /// </summary>
+        IFailureState FailureState { get; }
+
+        /// <summary>
+        /// The context where the stage is created.
+        /// </summary>
+        IElasticContext Context { get; }
+
+        /// <summary>
+        /// Whether the stage is completed or not.
+        /// </summary>
+        bool IsCompleted { get; }
+
+        /// <summary>
+        /// Whether the stage contains iterations or not.
+        /// </summary>
+        bool IsIterative { get; set; }
+
+        /// <summary>
+        /// Generates an id to uniquely identify operators in the stage.
+        /// </summary>
+        /// <returns>A new unique id</returns>
+        int GetNextOperatorId();
+
+        /// <summary>
+        /// Add a partitioned dataset to the stage.
+        /// </summary>
+        /// <param name="inputDataSet">The partitioned dataset</param>
+        /// <param name="isMasterGettingInputData">Whether the master node should get a partition</param>
+        void AddDataset(IPartitionedInputDataSet inputDataSet, bool isMasterGettingInputData = false);
+
+        /// <summary>
+        /// Add a set of datasets to the stage.
+        /// </summary>
+        /// <param name="inputDataSet">The configuration for the datasets</param>
+        /// <param name="isMasterGettingInputData">Whether the master node should get a partition</param>
+        void AddDataset(IConfiguration[] inputDataSet, bool isMasterGettingInputData = false);
+
+        /// <summary>
+        /// Finalizes the stage.
+        /// After the stage has been finalized, no more operators can
+        /// be added to the group.
+        /// </summary>
+        /// <returns>The same finalized stage</returns>
+        IElasticStage Build();
+
+        /// <summary>
+        /// Add a task to the stage.
+        /// The stage must have been built before tasks can be added.
+        /// </summary>
+        /// <param name="taskId">The id of the task to add</param>
+        /// <returns>True if the task is correctly added to the stage</returns>
+        bool AddTask(string taskId);
+
+        /// <summary>
+        /// Decides if the tasks added to the stage can be scheduled for execution
+        /// or not. This method is used for implementing different policies for
+        /// triggering the scheduling of tasks.
+        /// </summary>
+        /// <returns>True if the previously added tasks can be scheduled for execution</returns>
+        bool ScheduleStage();
+
+        /// <summary>
+        /// Whether the input activeContext is the one of the master tasks.
+        /// </summary>
+        /// <param name="activeContext">The active context of the task</param>
+        /// <returns>True if the input parameter is the master task's active context</returns>
+        bool IsMasterTaskContext(IActiveContext activeContext);
+
+        /// <summary>
+        /// Creates the Configuration for the input task.
+        /// Must be called only after all tasks have been added to the stage.
+        /// </summary>
+        /// <remarks>NOTE(review): the original comment also described a configuration builder parameter that this signature does not take.</remarks>
+        /// <param name="taskId">The task id of the task that belongs to this stage</param>
+        /// <returns>The configuration for the Task with added stage information</returns>
+        IConfiguration GetTaskConfiguration(int taskId);
+
+        /// <summary>
+        /// Given a task id, this method returns the configuration of the task's data partition
+        /// (if any).
+        /// </summary>
+        /// <param name="taskId">The task id of the task we want to retrieve the data partition.
+        /// The task is required to belong to the stage</param>
+        /// <returns>The configuration of the data partition (if any) of the task</returns>
+        Optional GetPartitionConf(string taskId);
+
+        /// <summary>
+        /// Method used to signal that the stage state can be moved to complete.
+        /// </summary>
+        void Complete();
+
+        /// <summary>
+        /// Retrieve the final statistics of the computation: this is the sum of all
+        /// the stats of all the Operators composing the stage. This method can be called
+        /// only once the stage is completed.
+        /// </summary>
+        /// <returns>The final statistics for the computation</returns>
+        string LogFinalStatistics();
+    }
+}
\ No newline at end of file
diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/IElasticTaskSetManager.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/IElasticTaskSetManager.cs
new file mode 100644
index 0000000000..7fa4fdad16
--- /dev/null
+++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/IElasticTaskSetManager.cs
@@ -0,0 +1,152 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+using Org.Apache.REEF.Driver.Context;
+using Org.Apache.REEF.Driver.Evaluator;
+using Org.Apache.REEF.Driver.Task;
+using Org.Apache.REEF.Network.Elastic.Failures;
+using Org.Apache.REEF.Tang.Interface;
+using Org.Apache.REEF.Utilities.Attributes;
+using System;
+using System.Collections.Generic;
+
+namespace Org.Apache.REEF.Network.Elastic.Driver
+{
+    ///
+    /// Class defining how groups of tasks sharing similar scheduling semantics are managed.
+    /// Task set managers subscribe to stages in order to define tasks logic.
+    /// Task set managers schedule and manage groups of tasks running in the cluster.
+    ///
+    [Unstable("0.16", "API may change")]
+    public interface IElasticTaskSetManager : IFailureResponse, IDisposable
+    {
+        /// <summary>
+        /// An identifier for the set of Stages the Task Manager is subscribed to.
+        /// The task set has to be built before retrieving its stages id.
+        /// </summary>
+        string StagesId { get; }
+
+        /// <summary>
+        /// Decides whether more contexts have to be added to this Task Manager or not.
+        /// </summary>
+        /// <returns>True if the number of added contexts is less than the available slots</returns>
+        bool HasMoreContextToAdd { get; }
+
+        /// <summary>
+        /// Whether this task set manager is done.
+        /// </summary>
+        bool IsCompleted { get; }
+
+        /// <summary>
+        /// Subscribe the current task set manager to a new stage.
+        /// </summary>
+        /// <param name="stage">The stage to subscribe to</param>
+        /// <returns>The task manager with the added stage</returns>
+        IElasticTaskSetManager AddStage(IElasticStage stage);
+
+        /// <summary>
+        /// Method used to generate unique context ids.
+        /// </summary>
+        /// <param name="evaluator">The evaluator the context will run on</param>
+        /// <param name="identifier">A new unique context id</param>
+        /// <returns>True if a new context id is successfully created</returns>
+        bool TryGetNextTaskContextId(IAllocatedEvaluator evaluator, out string identifier);
+
+        /// <summary>
+        /// Method used to generate unique task ids.
+        /// </summary>
+        /// <param name="context">The context the task will run on</param>
+        /// <returns>A new task id</returns>
+        string GetTaskId(IActiveContext context);
+
+        /// <summary>
+        /// Finalizes the task set manager.
+        /// After the task set has been finalized, no more stages can be added.
+        /// </summary>
+        /// <returns>The same finalized task set manager</returns>
+        IElasticTaskSetManager Build();
+
+        /// <summary>
+        /// Retrieves all stages having the context passed as a parameter
+        /// as master task context.
+        /// </summary>
+        /// <param name="context">The target context</param>
+        /// <returns>A list of stages having the master task running on context</returns>
+        IEnumerable IsMasterTaskContext(IActiveContext context);
+
+        /// <summary>
+        /// Get the configuration of the codecs used for data transmission.
+        /// The codecs are automatically generated from the operator pipeline.
+        /// </summary>
+        /// <returns>A configuration object with the codecs for data transmission</returns>
+        IConfiguration GetCodecConfiguration();
+
+        /// <summary>
+        /// Method implementing how the task set manager should react when a new context is active.
+        /// </summary>
+        /// <param name="activeContext">The new active context</param>
+        void OnNewActiveContext(IActiveContext activeContext);
+
+        /// <summary>
+        /// Method implementing how the task set manager should react when a notification that a task is running is received.
+        /// </summary>
+        /// <param name="task">The running task</param>
+        void OnTaskRunning(IRunningTask task);
+
+        /// <summary>
+        /// Method implementing how the task set manager should react when a notification that a task is completed is received.
+        /// </summary>
+        /// <param name="task">The completed task</param>
+        void OnTaskCompleted(ICompletedTask task);
+
+        /// <summary>
+        /// Method implementing how the task set manager should react when a task message is received.
+        /// </summary>
+        /// <param name="message">A message from a task</param>
+        void OnTaskMessage(ITaskMessage message);
+
+        /// <summary>
+        /// Whether the input task is managed by this task set manager.
+        /// </summary>
+        /// <param name="id">The task identifier</param>
+        bool IsTaskManagedBy(string id);
+
+        /// <summary>
+        /// Whether the input context is managed by this task set manager.
+        /// </summary>
+        /// <param name="id">The context identifier</param>
+        bool IsContextManagedBy(string id);
+
+        /// <summary>
+        /// Whether the input evaluator is managed by this task set manager.
+        /// </summary>
+        /// <param name="id">The evaluator identifier</param>
+        bool IsEvaluatorManagedBy(string id);
+
+        /// <summary>
+        /// Used to react on a task failure.
+        /// </summary>
+        /// <param name="task">The failed task</param>
+        void OnTaskFailure(IFailedTask task);
+
+        /// <summary>
+        /// Used to react to a failure event that occurred on an evaluator.
+        /// </summary>
+        /// <param name="evaluator">The failed evaluator</param>
+        void OnEvaluatorFailure(IFailedEvaluator evaluator);
+    }
+}
diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/IElasticTaskSetService.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/IElasticTaskSetService.cs
deleted file mode 100644
index dfd87b358b..0000000000
--- a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/IElasticTaskSetService.cs
+++ /dev/null
@@ -1,95 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-using Org.Apache.REEF.Network.Elastic.Failures;
-using Org.Apache.REEF.Tang.Interface;
-using Org.Apache.REEF.Utilities.Attributes;
-
-namespace Org.Apache.REEF.Network.Elastic.Driver
-{
-    ///
-    /// Used to create Subscriptions for fault tolerant Task Sets.
-    /// Also manages configurations for Group Communication operators/services.
-    ///
-    [Unstable("0.16", "API may change")]
-    public interface IElasticTaskSetService : IFailureResponse
-    {
-        ///
-        /// Creates a Subscription with the default settings.
-        /// The subscription lifecicle is managed by the service.
-        ///
-        /// A new Task Set Subscription with default parameters
-        IElasticTaskSetSubscription DefaultTaskSetSubscription();
-
-        ///
-        /// Creates a new Task Set Subscription.
- /// The subscription lifecicle is managed by the service. - /// - /// The name of the subscription - /// The number of tasks required by the subscription - /// An optional failure machine governing the subscription - /// The new Task Set Subscrption - IElasticTaskSetSubscription NewTaskSetSubscription(string subscriptionName, int numTasks, IFailureStateMachine failureMachine = null); - - /// - /// Remove a Task Set Subscription from the service. - /// - /// The name of the subscription - void RemoveTaskSetSubscription(string subscriptionName); - - /// - /// Generate the service configuration object. - /// This method is used to properly configure the Context with the service. - /// - /// The Service Configuration - IConfiguration GetServiceConfiguration(); - - /// - /// At task submission time the following steps are executed: - /// 1) Each subscription the task is registered to generates a task subscription - /// 2) Internally each configuration generated by subscriptions contains a configuration entry for each - /// operator defining the subscription. Such operator configurations are serialized using - /// {@link Org.Apache.REEF.Network.Elastic.Driver.IElasticTaskSetService#SerializeOperatorConfiguration} - /// 3) Tasks subscriptions are serialized into a configuration - /// 4) The service Task configuration is added to the configuration object containing the serialized subscription confs - /// 5) the Task configuration is merged with the configuraiton object of 4) to generate the final task configuration - /// - /// - /// - /// Creates a generic Task Configuration object for the tasks registering to the service. - /// - /// The configuration of the subscription the task will register to - /// The configuration for the Task with added service parameters - IConfiguration GetTaskConfiguration(ICsConfigurationBuilder subscriptionsConf); - - /// - /// Appends a subscription configuration to a configuration builder object. 
- /// - /// The configuration where the subscription configuration will be appended to - /// The subscription configuration at hand - /// The configuration containing the serialized subscription configuration - void SerializeSubscriptionConfiguration(ref ICsConfigurationBuilder confBuilder, IConfiguration subscriptionConf); - - /// - /// Append an operator configuration to a configuration builder object. - /// - /// The configuration where the operator configuration will be appended to - /// The operator configuration at hand - /// The configuration containing the serialized operator configuration - void SerializeOperatorConfiguration(ref ICsConfigurationBuilder confBuilder, IConfiguration operatorConf); - } -} \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/IElasticTaskSetSubscription.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/IElasticTaskSetSubscription.cs deleted file mode 100644 index fbc1c48bd9..0000000000 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/IElasticTaskSetSubscription.cs +++ /dev/null @@ -1,94 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -using Org.Apache.REEF.Tang.Interface; -using Org.Apache.REEF.Driver.Context; -using Org.Apache.REEF.Network.Elastic.Failures; -using Org.Apache.REEF.Utilities.Attributes; - -namespace Org.Apache.REEF.Network.Elastic.Driver -{ - /// - /// Used to group operators in logical units. - /// All operators in the same Subscription share similar semantics - /// and behaviour under failures. - /// - [Unstable("0.16", "API may change")] - public interface IElasticTaskSetSubscription : IFailureResponse - { - /// - /// The name of the Subscription. - /// - string SubscriptionName { get; } - - /// - /// The Failure State of the target Subscription. - /// - IFailureState FailureStatus { get; } - - /// - /// The Service managing the Subscription. - /// - IElasticTaskSetService Service { get; } - - /// - /// Generates an id to uniquely identify operators in the Subscription. - /// - /// A new unique id - int GetNextOperatorId(); - - /// - /// Finalizes the Subscription. - /// After the Subscription has been finalized, no more operators may - /// be added to the group. - /// - /// The same finalized Subscription - IElasticTaskSetSubscription Build(); - - /// - /// Add a task to the Subscription. - /// The Subscription must have called Build() before adding tasks. - /// - /// The id of the task to add - /// True if the task is added to the Subscription - bool AddTask(string taskId); - - /// - /// Decides if the tasks added to the Subscription can be scheduled for execution - /// or not. Method used for implementing different policies for - /// triggering the scheduling of tasks. - /// - /// True if the added tasks can be scheduled for execution - bool ScheduleSubscription(); - - /// - /// Whether the input activeContext is the one of the master Task. - /// - /// The active context for the task - /// True if the parameter is the master task's active context - bool IsMasterTaskContext(IActiveContext activeContext); - - /// - /// Creates the Configuration for the input task. 
- /// Must be called only after all tasks have been added to the Subscription. - /// - /// The configuration builder the configuration will be appended to - /// The task id of the task that belongs to this Subscription - /// The configuration for the Task with added Subscription informations - void GetTaskConfiguration(ref ICsConfigurationBuilder builder, int taskId); - } -} \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/ITaskSetManager.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/ITaskSetManager.cs deleted file mode 100644 index 4d4e8063d8..0000000000 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/ITaskSetManager.cs +++ /dev/null @@ -1,127 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -using System.Collections.Generic; -using Org.Apache.REEF.Driver.Context; -using Org.Apache.REEF.Tang.Interface; -using Org.Apache.REEF.Driver.Evaluator; -using Org.Apache.REEF.Driver.Task; -using Org.Apache.REEF.Network.Elastic.Failures; -using Org.Apache.REEF.Utilities.Attributes; -using System; - -namespace Org.Apache.REEF.Network.Elastic.Driver -{ - /// - /// Class defining how groups of tasks sharing similar scheduling semantics are managed. 
- /// TaskSets subscribe to Subscriptions in order to define tasks logic. - /// TaskSets schedule and manage group of tasks running in the cluster. - /// - [Unstable("0.16", "API may change")] - public interface ITaskSetManager : IFailureResponse, IDisposable - { - /// - /// An identifier for the set of Subscriptions the Task Manager is subscribed to. - /// The Task Set has to be built before retrieving its subscriptions id. - /// - string SubscriptionsId { get; } - - /// - /// Subscribe the current Task Set to a new Subscription. - /// - /// The subscription to subscribe to - void AddTaskSetSubscription(IElasticTaskSetSubscription subscription); - - /// - /// Decides whether more contexts have to be added to this Task Manger or not. - /// - /// True if the number of added contexts is less than the available slots - bool HasMoreContextToAdd(); - - /// - /// Method used to generate unique context ids. - /// - /// The evaluator the context will run on - /// A new unique context id - int GetNextTaskContextId(IAllocatedEvaluator evaluator = null); - - /// - /// Method used to generate unique task ids. - /// - /// The context the task will run on - /// A new task id - int GetNextTaskId(IActiveContext context = null); - - /// - /// Finalizes the Task Set. - /// After the Task set has been finalized, no more Subscriptions can be added. - /// - /// The same finalized Task Set - ITaskSetManager Build(); - - /// - /// Retrieves all Subscriptions having the context passed as a parameter - /// as master task context. - /// - /// The target context - /// A list of Subscriptions having the master task running on context - IEnumerable IsMasterTaskContext(IActiveContext context); - - /// - /// Add a task to the Task Set. - /// The Task Set must have called Build() before adding tasks. 
- /// - /// The id of the task to add - /// The current configuration of the task - /// The context the task will run on - void AddTask(string taskId, IConfiguration taskConfig, IActiveContext context); - - /// - /// Actions to execute when a notification that a task is running is received. - /// - /// The running task - void OnTaskRunning(IRunningTask task); - - /// - /// Actions to execute when a notification that a task is completed is received. - /// - /// The completed task - void OnTaskCompleted(ICompletedTask task); - - /// - /// Actions to execute when a task message is received. - /// - /// A message from a task - void OnTaskMessage(ITaskMessage message); - - /// - /// This method contains the logic to trigger when the Task Set execution is completed - /// - bool Done(); - - /// - /// Used to react of a failure event occurred on an evaluator. - /// - /// The failed evaluator - void OnEvaluatorFailure(IFailedEvaluator evaluator); - - /// - /// Contains the logic to trigger when the execution fails. - /// - void OnFail(); - } -} \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/DefaultFailureState.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/DefaultFailureState.cs new file mode 100644 index 0000000000..bdee88e901 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/DefaultFailureState.cs @@ -0,0 +1,69 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
namespace Org.Apache.REEF.Network.Elastic.Failures.Default
{
    /// <summary>
    /// The default implementation of <see cref="IFailureState"/>.
    /// A state is stored as an integer; the default values are the ones
    /// defined by the <see cref="DefaultFailureStates"/> enum.
    /// </summary>
    [Unstable("0.16", "API may change")]
    public sealed class DefaultFailureState : IFailureState
    {
        /// <summary>
        /// Create a default failure state for 0 (Continue).
        /// </summary>
        public DefaultFailureState()
        {
            FailureState = (int)DefaultFailureStates.Continue;
        }

        /// <summary>
        /// Create a default failure state for the input state.
        /// </summary>
        /// <param name="state">The input state we want to create a failure state from</param>
        public DefaultFailureState(int state)
        {
            FailureState = state;
        }

        /// <summary>
        /// The current failure state. It is assumed that bigger values mean
        /// a worse failure state.
        /// </summary>
        public int FailureState { get; set; }

        /// <summary>
        /// Merges the current failure state with the one passed as parameter.
        /// The result is a new state holding the larger (i.e., worse) of the two values;
        /// neither input is modified.
        /// </summary>
        /// <param name="that">A new failure state</param>
        /// <returns>The merge of the two failure states</returns>
        /// <exception cref="ArgumentNullException">If <paramref name="that"/> is null</exception>
        public IFailureState Merge(IFailureState that)
        {
            if (that == null)
            {
                throw new ArgumentNullException(nameof(that));
            }

            return new DefaultFailureState(Math.Max(FailureState, that.FailureState));
        }

        /// <summary>
        /// Utility method pairing a default failure state with a threshold value.
        /// </summary>
        /// <param name="state">The failure state the threshold refers to</param>
        /// <param name="weight">The threshold value to pair with the state</param>
        /// <returns>A (failure state, threshold) pair</returns>
        public static Tuple<IFailureState, float> Threshold(DefaultFailureStates state, float weight)
        {
            return new Tuple<IFailureState, float>(new DefaultFailureState((int)state), weight);
        }
    }
}
namespace Org.Apache.REEF.Network.Elastic.Failures.Default
{
    /// <summary>
    /// The list of default failure events triggered by default state changes.
    /// The numeric values start at 1 and are stated explicitly.
    /// </summary>
    [Unstable("0.16", "The default evens may change")]
    public enum DefaultFailureStateEvents
    {
        /// <summary>The Continue event (1).</summary>
        Continue = 1,

        /// <summary>The Reconfigure event (2).</summary>
        Reconfigure = 2,

        /// <summary>The Reschedule event (3).</summary>
        Reschedule = 3,

        /// <summary>The Stop event (4).</summary>
        Stop = 4,

        /// <summary>The Fail event (5).</summary>
        Fail = 5
    }
}
namespace Org.Apache.REEF.Network.Elastic.Failures.Default
{
    /// <summary>
    /// The default implementation of the failure state machine.
    /// This implementation has 5 failure states, ordered by severity:
    /// - Continue the computation and ignore the failures
    /// - Continue and reconfigure the operators based on the received failures
    /// - Continue, reconfigure the operators while trying to reschedule failed tasks
    /// - Stop the computation and try to reschedule the tasks
    /// - Fail.
    /// In addition, a final Complete state is entered through <see cref="Complete"/>.
    /// The state is driven by the ratio between failed data points and total data points,
    /// compared against a per-state threshold.
    /// </summary>
    [Unstable("0.16", "API may change")]
    public sealed class DefaultFailureStateMachine : IFailureStateMachine
    {
        // Guards the counters, the current state value and the threshold table.
        private readonly object _statusLock = new object();

        // For each state that can still escalate, the next more severe state.
        private static readonly SortedDictionary<DefaultFailureStates, DefaultFailureStates> TransitionMapUp =
            new SortedDictionary<DefaultFailureStates, DefaultFailureStates>
            {
                { DefaultFailureStates.Continue, DefaultFailureStates.ContinueAndReconfigure },
                { DefaultFailureStates.ContinueAndReconfigure, DefaultFailureStates.ContinueAndReschedule },
                { DefaultFailureStates.ContinueAndReschedule, DefaultFailureStates.StopAndReschedule },
                { DefaultFailureStates.StopAndReschedule, DefaultFailureStates.Fail }
            };

        // For each state that can still recover, the next less severe state.
        private static readonly SortedDictionary<DefaultFailureStates, DefaultFailureStates> TransitionMapDown =
            new SortedDictionary<DefaultFailureStates, DefaultFailureStates>
            {
                { DefaultFailureStates.ContinueAndReconfigure, DefaultFailureStates.Continue },
                { DefaultFailureStates.ContinueAndReschedule, DefaultFailureStates.ContinueAndReconfigure },
                { DefaultFailureStates.StopAndReschedule, DefaultFailureStates.ContinueAndReschedule },
                { DefaultFailureStates.Fail, DefaultFailureStates.StopAndReschedule }
            };

        // Failure-rate threshold of every state above Continue (default values).
        private readonly SortedDictionary<DefaultFailureStates, float> transitionWeights =
            new SortedDictionary<DefaultFailureStates, float>
            {
                { DefaultFailureStates.ContinueAndReconfigure, 0.01F },
                { DefaultFailureStates.ContinueAndReschedule, 0.40F },
                { DefaultFailureStates.StopAndReschedule, 0.60F },
                { DefaultFailureStates.Fail, 0.80F }
            };

        // States from which Complete() is accepted (Complete itself included).
        private static readonly int[] CanMoveToComplete = new int[]
        {
            (int)DefaultFailureStates.Continue,
            (int)DefaultFailureStates.ContinueAndReconfigure,
            (int)DefaultFailureStates.ContinueAndReschedule,
            (int)DefaultFailureStates.Complete
        };

        // States that are never left once reached.
        private static readonly int[] IsFinalState = new int[]
        {
            (int)DefaultFailureStates.Complete
        };

        /// <summary>
        /// Default failure state machine starting with 0 data points and in continue state.
        /// </summary>
        [Inject]
        public DefaultFailureStateMachine() : this(0, DefaultFailureStates.Continue)
        {
        }

        /// <summary>
        /// Default failure state machine starting with a given amount of data points and a given
        /// initial state. The initial points are counted both as total and as failed data points.
        /// </summary>
        /// <param name="initalPoints">The number of initial data points for the machine, 0 by default</param>
        /// <param name="initalState">The initial state, continue by default</param>
        public DefaultFailureStateMachine(
            int initalPoints = 0,
            DefaultFailureStates initalState = DefaultFailureStates.Continue)
        {
            NumOfDataPoints = initalPoints;
            NumOfFailedDataPoints = initalPoints;
            State = new DefaultFailureState((int)initalState);
        }

        /// <summary>
        /// The machine current failure state.
        /// The same object is kept for the whole life of the machine: transitions
        /// update its <see cref="IFailureState.FailureState"/> value in place.
        /// </summary>
        public IFailureState State { get; private set; }

        /// <summary>
        /// The total number of data points known to the machine.
        /// </summary>
        public int NumOfDataPoints { get; private set; }

        /// <summary>
        /// The current number of data points not reachable because of failures.
        /// </summary>
        public int NumOfFailedDataPoints { get; private set; }

        /// <summary>
        /// Add new data point(s) to the failure machine.
        /// This method can be called either at initialization, or when
        /// new data points become available at runtime e.g., after a failure
        /// is resolved.
        /// </summary>
        /// <param name="points">How many data points to add</param>
        /// <param name="isNew">Whether the data points are new (total grows) or restored
        /// from previously failed points (failed count shrinks)</param>
        /// <returns>The failure state resulting from the addition of the data points</returns>
        public IFailureState AddDataPoints(int points, bool isNew)
        {
            lock (_statusLock)
            {
                if (IsFinalState.Contains(State.FailureState))
                {
                    return State;
                }

                if (isNew)
                {
                    NumOfDataPoints += points;
                }
                else
                {
                    NumOfFailedDataPoints -= points;
                }

                if (State.FailureState > (int)DefaultFailureStates.Continue &&
                    State.FailureState <= (int)DefaultFailureStates.Fail)
                {
                    // Floating-point division: with zero total points this is NaN or infinity
                    // (no exception), in which case the comparison below is false.
                    float currentRate = (float)NumOfFailedDataPoints / NumOfDataPoints;

                    // Step down one state at a time while the rate is below the current state's threshold.
                    while (State.FailureState > (int)DefaultFailureStates.Continue &&
                        currentRate < transitionWeights[(DefaultFailureStates)State.FailureState])
                    {
                        State.FailureState = (int)TransitionMapDown[(DefaultFailureStates)State.FailureState];
                    }
                }

                return State;
            }
        }

        /// <summary>
        /// Remove data point(s) from the failure machine as a result of a runtime failure.
        /// </summary>
        /// <param name="points">How many data points to remove</param>
        /// <returns>The failure state resulting from the removal of the data points</returns>
        /// <exception cref="IllegalStateException">If the machine is already complete and the
        /// failure rate reaches the StopAndReschedule threshold</exception>
        public IFailureState RemoveDataPoints(int points)
        {
            lock (_statusLock)
            {
                NumOfFailedDataPoints += points;

                float currentRate = (float)NumOfFailedDataPoints / NumOfDataPoints;

                if (IsFinalState.Contains(State.FailureState) &&
                    currentRate >= transitionWeights[DefaultFailureStates.StopAndReschedule])
                {
                    throw new IllegalStateException("Received remove data point when state is complete: failing.");
                }

                // Step up one state at a time while the rate exceeds the next state's threshold.
                while (State.FailureState < (int)DefaultFailureStates.Fail &&
                    currentRate > transitionWeights[TransitionMapUp[(DefaultFailureStates)State.FailureState]])
                {
                    State.FailureState = (int)TransitionMapUp[(DefaultFailureStates)State.FailureState];
                }

                return State;
            }
        }

        /// <summary>
        /// Signal the state machine to move into complete state.
        /// </summary>
        /// <returns>The machine state, now Complete</returns>
        /// <exception cref="IllegalStateException">If the current state is StopAndReschedule,
        /// Fail or any other state from which completion is not allowed</exception>
        public IFailureState Complete()
        {
            lock (_statusLock)
            {
                if (CanMoveToComplete.Contains(State.FailureState))
                {
                    State.FailureState = (int)DefaultFailureStates.Complete;
                }
                else
                {
                    throw new IllegalStateException(
                        $"Failure machine cannot move from state {State.FailureState} to Complete: failing.");
                }
            }

            return State;
        }

        /// <summary>
        /// Method used to set or update the current threshold connected with
        /// a target failure state. The assumption is that higher failure states
        /// have higher thresholds.
        /// </summary>
        /// <param name="level">The failure state we want to change</param>
        /// <param name="threshold">A [0, 1] value specifying when the failure level is reached</param>
        /// <exception cref="ArgumentException">If the level is not a DefaultFailureState
        /// or is the Continue state</exception>
        /// <exception cref="IllegalStateException">If the resulting thresholds are not ordered</exception>
        public void SetThreshold(IFailureState level, float threshold)
        {
            if (!(level is DefaultFailureState))
            {
                throw new ArgumentException(level.GetType() + " is not DefaultFailureState.");
            }

            if (level.FailureState.IsContinue())
            {
                throw new ArgumentException("Cannot change the threshold for Continue state.");
            }

            lock (_statusLock)
            {
                transitionWeights[(DefaultFailureStates)level.FailureState] = threshold;

                CheckConsistency();
            }
        }

        /// <summary>
        /// A utility method for setting multiple thresholds at once.
        /// All thresholds are stored first and the consistency check runs once at the end.
        /// </summary>
        /// <param name="weights">Pairs of failure states with related new thresholds</param>
        /// <exception cref="ArgumentException">If any state is not a DefaultFailureState
        /// or is the Continue state</exception>
        /// <exception cref="IllegalStateException">If the resulting thresholds are not ordered</exception>
        public void SetThresholds(params Tuple<IFailureState, float>[] weights)
        {
            foreach (var weight in weights)
            {
                if (!(weight.Item1 is DefaultFailureState))
                {
                    throw new ArgumentException("Input is not of type DefaultFailureState.");
                }

                if (weight.Item1.FailureState.IsContinue())
                {
                    throw new ArgumentException("Cannot change the threshold for Continue state.");
                }
            }

            lock (_statusLock)
            {
                foreach (Tuple<IFailureState, float> weight in weights)
                {
                    transitionWeights[(DefaultFailureStates)weight.Item1.FailureState] = weight.Item2;
                }

                CheckConsistency();
            }
        }

        /// <summary>
        /// Utility method used to clone the target failure machine.
        /// Only the thresholds are cloned, while the machine state is not.
        /// </summary>
        /// <param name="initalPoints">How many data points are available in the new state machine</param>
        /// <param name="initalState">The state from which the new machine should start</param>
        /// <returns>A new failure machine with the same settings</returns>
        public IFailureStateMachine Clone(
            int initalPoints = 0,
            int initalState = (int)DefaultFailureStates.Continue)
        {
            var newMachine = new DefaultFailureStateMachine(initalPoints, (DefaultFailureStates)initalState);

            Tuple<IFailureState, float>[] thresholds;

            lock (_statusLock)
            {
                thresholds = transitionWeights
                    .Select(entry => new Tuple<IFailureState, float>(
                        new DefaultFailureState((int)entry.Key), entry.Value))
                    .ToArray();
            }

            // Copy every threshold in a single call: copying them one by one would validate
            // a mix of this machine's values and the new machine's defaults, and reject
            // threshold sets that are consistent as a whole.
            newMachine.SetThresholds(thresholds);

            return newMachine;
        }

        /// <summary>
        /// Check that the thresholds are consistent, i.e., that they never decrease when
        /// walking the states from ContinueAndReconfigure up to Fail.
        /// </summary>
        /// <exception cref="IllegalStateException">If a state has a smaller threshold than
        /// the state right below it</exception>
        private void CheckConsistency()
        {
            lock (_statusLock)
            {
                var state = DefaultFailureStates.ContinueAndReconfigure;
                float prevWeight = transitionWeights[state];

                // Fail has no entry in TransitionMapUp, so the walk stops after Fail has been checked.
                while (TransitionMapUp.TryGetValue(state, out DefaultFailureStates next))
                {
                    float nextWeight = transitionWeights[next];

                    if (nextWeight < prevWeight)
                    {
                        throw new IllegalStateException(
                            $"State {state} weight is bigger than state {next}.");
                    }

                    prevWeight = nextWeight;
                    state = next;
                }
            }
        }
    }
}
namespace Org.Apache.REEF.Network.Elastic.Failures.Default
{
    /// <summary>
    /// The default failure states.
    /// </summary>
    [Unstable("0.16", "The default states may change")]
    public enum DefaultFailureStates
    {
        /// <summary>When a failure is detected, just ignore it and continue the computation.</summary>
        Continue = 0,

        /// <summary>When a failure is detected, continue the computation by properly reconfiguring the operators.</summary>
        ContinueAndReconfigure = 1,

        /// <summary>When a failure is detected, continue the computation by reconfiguring the operators and try to reschedule the task.</summary>
        ContinueAndReschedule = 2,

        /// <summary>When a failure is detected, stop the computation and try to reschedule the task.</summary>
        StopAndReschedule = 3,

        /// <summary>Fail.</summary>
        Fail = 4,

        /// <summary>Complete, final state.</summary>
        Complete = 5
    }

    /// <summary>
    /// Extension methods for default failure states.
    /// Each method compares an integer state value with one member of
    /// <see cref="DefaultFailureStates"/>.
    /// </summary>
    public static class DefaultFailureStateExtensions
    {
        /// <summary>
        /// Whether the current failure state is Continue.
        /// </summary>
        /// <param name="state">The current failure state</param>
        /// <returns>True if is Continue</returns>
        public static bool IsContinue(this int state)
        {
            return state == (int)DefaultFailureStates.Continue;
        }

        /// <summary>
        /// Whether the current failure state is ContinueAndReconfigure.
        /// </summary>
        /// <param name="state">The current failure state</param>
        /// <returns>True if is ContinueAndReconfigure</returns>
        public static bool IsContinueAndReconfigure(this int state)
        {
            return state == (int)DefaultFailureStates.ContinueAndReconfigure;
        }

        /// <summary>
        /// Whether the current failure state is ContinueAndReschedule.
        /// </summary>
        /// <param name="state">The current failure state</param>
        /// <returns>True if is ContinueAndReschedule</returns>
        public static bool IsContinueAndReschedule(this int state)
        {
            return state == (int)DefaultFailureStates.ContinueAndReschedule;
        }

        /// <summary>
        /// Whether the current failure state is StopAndReschedule.
        /// </summary>
        /// <param name="state">The current failure state</param>
        /// <returns>True if is StopAndReschedule</returns>
        public static bool IsStopAndReschedule(this int state)
        {
            return state == (int)DefaultFailureStates.StopAndReschedule;
        }

        /// <summary>
        /// Whether the current failure state is Fail.
        /// </summary>
        /// <param name="state">The current failure state</param>
        /// <returns>True if is Fail</returns>
        public static bool IsFail(this int state)
        {
            return state == (int)DefaultFailureStates.Fail;
        }

        /// <summary>
        /// Whether the current failure state is Complete.
        /// </summary>
        /// <param name="state">The current failure state</param>
        /// <returns>True if is Complete</returns>
        public static bool IsComplete(this int state)
        {
            return state == (int)DefaultFailureStates.Complete;
        }
    }
}
namespace Org.Apache.REEF.Network.Elastic.Failures.Default
{
    /// <summary>
    /// Failure event asking to fail the current execution.
    /// </summary>
    [Unstable("0.16", "API may change")]
    internal class FailEvent : IFailureEvent
    {
        /// <summary>
        /// Creates a fail event for the given task.
        /// </summary>
        /// <param name="taskId">The identifier of the task triggering the failure</param>
        public FailEvent(string taskId)
        {
            TaskId = taskId;
        }

        /// <summary>
        /// The event / action raised by the transition to the new failure state:
        /// always <see cref="DefaultFailureStateEvents.Fail"/>.
        /// </summary>
        public int FailureEvent => (int)DefaultFailureStateEvents.Fail;

        /// <summary>
        /// The identifier of the task triggering the event.
        /// </summary>
        public string TaskId { get; }

        /// <summary>
        /// The operator id in which the failure is raised: always -1 for this event.
        /// </summary>
        public int OperatorId => -1;

        /// <summary>
        /// Messages implementing the response from the driver to the tasks
        /// to reconfigure the computation. Starts empty.
        /// </summary>
        // NOTE(review): the list element type is not visible in this listing;
        // IElasticDriverMessage (Elastic.Comm) is assumed - confirm against IFailureEvent.
        public List<IElasticDriverMessage> FailureResponse { get; } = new List<IElasticDriverMessage>();
    }
}
namespace Org.Apache.REEF.Network.Elastic.Failures.Default
{
    /// <summary>
    /// Default failures response interface.
    /// The default events are Reconfigure, Reschedule, Stop and Fail.
    /// Mechanisms implementing the default failure responses must extend this interface.
    /// </summary>
    [Unstable("0.16", "API may change")]
    internal interface IDefaultFailureEventResponse
    {
        /// <summary>
        /// Mechanism to execute when a reconfigure event is triggered.
        /// </summary>
        /// <param name="reconfigureEvent">The reconfigure event, passed by reference</param>
        void OnReconfigure(ref ReconfigureEvent reconfigureEvent);

        /// <summary>
        /// Mechanism to execute when a reschedule event is triggered.
        /// </summary>
        /// <param name="rescheduleEvent">The reschedule event, passed by reference</param>
        void OnReschedule(ref RescheduleEvent rescheduleEvent);

        /// <summary>
        /// Mechanism to execute when a stop event is triggered.
        /// </summary>
        /// <param name="stopEvent">The stop event, passed by reference</param>
        void OnStop(ref StopEvent stopEvent);

        /// <summary>
        /// Mechanism to execute when a fail event is triggered.
        /// </summary>
        void OnFail();
    }
}
+ /// + void OnFail(); + } +} diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/ReconfigureEvent.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/ReconfigureEvent.cs new file mode 100644 index 0000000000..3c9a2579df --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/ReconfigureEvent.cs @@ -0,0 +1,77 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +using Org.Apache.REEF.Driver.Task; +using Org.Apache.REEF.Network.Elastic.Comm; +using Org.Apache.REEF.Utilities; +using Org.Apache.REEF.Utilities.Attributes; +using System.Collections.Generic; + +namespace Org.Apache.REEF.Network.Elastic.Failures.Default +{ + /// + /// Reconfigure the execution to work with fewer tasks. + /// + [Unstable("0.16", "API may change")] + public class ReconfigureEvent : IFailureEvent + { + /// + /// Constructor for a reconfigure event. + /// + /// The failed task + /// The operator identifier in which the event was detected + public ReconfigureEvent(IFailedTask failedTask, int operatorId) + { + FailedTask = Optional.OfNullable(failedTask); + TaskId = failedTask?.Id; + OperatorId = operatorId; + } + + /// + /// The event / action raised by the transition to the new failure state. 
+ /// + public virtual int FailureEvent + { + get { return (int)DefaultFailureStateEvents.Reconfigure; } + } + + /// + /// The failed task triggering the event. + /// + public Optional FailedTask { get; set; } + + /// + /// The iteration in which the failure is raised. + /// + public int? Iteration { get; set; } = null; + + /// + /// The identifier of the task triggering the event. + /// + public string TaskId { get; protected set; } + + /// + /// The operator id in which the failure is raised. + /// + public int OperatorId { get; protected set; } + + /// + /// The response message generated to react to the failure event. + /// + public List FailureResponse { get; protected set; } = new List(); + } +} diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/RescheduleEvent.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/RescheduleEvent.cs new file mode 100644 index 0000000000..fc4247d4b7 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/RescheduleEvent.cs @@ -0,0 +1,62 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License.
+ +using System.Collections.Generic; +using Org.Apache.REEF.Tang.Interface; +using Org.Apache.REEF.Utilities.Attributes; + +namespace Org.Apache.REEF.Network.Elastic.Failures.Default +{ + /// + /// Reconfigure the execution to work with fewer tasks and simultaneously try to + /// reschedule a new task. + /// + [Unstable("0.16", "API may change")] + public class RescheduleEvent : ReconfigureEvent + { + /// + /// Constructor for the reschedule event. + /// + /// The identifier of the task triggering the failure event + public RescheduleEvent(string taskId) : base(null, -1) + { + TaskId = taskId; + RescheduleTaskConfigurations = new Dictionary>(); + } + + /// + /// The event / action raised by the transition to the new failure state. + /// + public override int FailureEvent + { + get { return (int)DefaultFailureStateEvents.Reschedule; } + } + + /// + /// The configurations for the stages of the task. + /// + public Dictionary> RescheduleTaskConfigurations { get; private set; } + + /// + /// Whether the task should be rescheduled as a consequence of this event. + /// + public bool Reschedule + { + get { return RescheduleTaskConfigurations.Count > 0; } + } + } +} diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/StopEvent.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/StopEvent.cs new file mode 100644 index 0000000000..eda798ac9a --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/StopEvent.cs @@ -0,0 +1,49 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License.
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +using Org.Apache.REEF.Network.Elastic.Comm; +using Org.Apache.REEF.Utilities.Attributes; +using System.Collections.Generic; + +namespace Org.Apache.REEF.Network.Elastic.Failures.Default +{ + /// + /// Stop the execution and try to add new tasks. + /// + [Unstable("0.16", "API may change")] + public sealed class StopEvent : RescheduleEvent + { + /// + /// Constructor for the stop event. The body re-assigns TaskId, OperatorId (-1) and FailureResponse, which the base constructors and the FailureResponse initializer already set to equivalent values. + /// + /// The identifier of the task triggering the failure event + public StopEvent(string taskId) : base(taskId) + { + TaskId = taskId; + OperatorId = -1; + FailureResponse = new List(); + } + + /// + /// The event / action raised by the transition to the new failure state. + /// + public override int FailureEvent + { + get { return (int)DefaultFailureStateEvents.Stop; } + } + } +} diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Enum/CheckpointLevel.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Enum/CheckpointLevel.cs new file mode 100644 index 0000000000..ad32568308 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Enum/CheckpointLevel.cs @@ -0,0 +1,30 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License.
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +using Org.Apache.REEF.Utilities.Attributes; + +namespace Org.Apache.REEF.Network.Elastic.Failures.Enum +{ + /// + /// Definition of supported checkpointing policies. + /// + [Unstable("0.16", "Policies may change")] + public enum CheckpointLevel : int + { + None = 0, // No checkpointing + } +} diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/FailuresClock.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/FailuresClock.cs new file mode 100644 index 0000000000..99766d0e0d --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/FailuresClock.cs @@ -0,0 +1,267 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Threading; +using Org.Apache.REEF.Tang.Annotations; +using Org.Apache.REEF.Tang.Exceptions; +using Org.Apache.REEF.Tang.Implementations.InjectionPlan; +using Org.Apache.REEF.Utilities; +using Org.Apache.REEF.Utilities.Collections; +using Org.Apache.REEF.Utilities.Logging; +using Org.Apache.REEF.Wake.RX.Impl; +using Org.Apache.REEF.Wake.Time.Event; +using Org.Apache.REEF.Wake.Time.Runtime.Event; +using Org.Apache.REEF.Wake.Time; +using Org.Apache.REEF.Wake.Time.Runtime; + +namespace Org.Apache.REEF.Network.Elastic.Failures +{ + /// + /// Clock used to trigger failures events. + /// + internal sealed class FailuresClock : IClock + { + private static readonly Logger Log = Logger.GetLogger(typeof(FailuresClock)); + + private static int numberOfInstantiations = 0; + + private readonly ITimer _timer; + private readonly PubSubSubject - /// A value identifing the failure state int FailureState { get; set; } /// diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/IFailureStateMachine.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/IFailureStateMachine.cs index 3126f63e24..ef3f7a3716 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/IFailureStateMachine.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/IFailureStateMachine.cs @@ -15,21 +15,28 @@ // specific language governing permissions and limitations // under the License. -using System; +using Org.Apache.REEF.Network.Elastic.Failures.Default; +using Org.Apache.REEF.Tang.Annotations; using Org.Apache.REEF.Utilities.Attributes; +using System; namespace Org.Apache.REEF.Network.Elastic.Failures { /// /// Where the decision is made on what to do when a failure happen. - /// A decision is made based on the ration between the initial data points + /// A decision is made based on the ratio between the initial data points /// and how many data points are lost. 
+ /// Decisions are in the form of failure states and threshold levels. + /// Failure machines should work as ladders, when some data is lost and the number + /// of available data points moves below / above one of the thresholds, the state of the + /// machine changes. /// [Unstable("0.16", "API may change")] + [DefaultImplementation(typeof(DefaultFailureStateMachine))] public interface IFailureStateMachine { /// - /// The Machine current failure state. + /// The machine current failure state. /// IFailureState State { get; } @@ -46,47 +53,50 @@ public interface IFailureStateMachine /// /// Method used to set or update the current threshold connected with /// a target failure state. The assumption is that higher failure states - /// have higher thresholds. + /// have higher thresholds. If multiple thresholds need to be changed, use + /// the SetThresholds method instead. /// /// The failure state we want to change /// A [0, 1] value specifying when the failure level is reached - void SetThreashold(IFailureState level, float threshold); + void SetThreshold(IFailureState level, float threshold); /// - /// A utility method for setting multiple threshould at once. + /// A utility method for setting multiple thresholds at once. + /// This method is appropriate when multiple thresholds need to be set at once. /// - /// Pairs of failure states with realted new threshold - void SetThreasholds(Tuple[] weights); + /// Pairs of failure states with related new thresholds + void SetThresholds(params Tuple[] weights); /// - /// Add new data point(s) to the Failure Machine. + /// Add new data point(s) to the failure machine. /// This method can be called either at initialization, or when /// new data points becomes available at runtime e.g., after a failure /// is resolved.
/// /// How many data point to add + /// Whether the data points are new or restored from previously failed points /// The failure state resulting from the addition of the data points - IFailureState AddDataPoints(int points); + IFailureState AddDataPoints(int points, bool isNew); /// - /// Remove data point(s) from the Failure Machine as a result of a runtime failure. + /// Remove data point(s) from the failure machine as a result of a runtime failure. /// /// How many data point to remove - /// The failure state resulting from the removal of the data points + /// The failure state reached after the removal of the data points IFailureState RemoveDataPoints(int points); /// - /// Finalizes the Failure Machine. - /// Once finalized, each newly added data point is considered as resolving a failure. + /// Signal the state machine to move into the complete state. /// - /// The same finalized Failure Machine - IFailureStateMachine Build(); + IFailureState Complete(); /// /// Utility method used to clone the target failure machine. /// Only the thresholds are cloned, while the machine state is not. /// + /// How many data points are available in the new state machine + /// The state from which the new machine should start /// A new failure machine with the same settings - IFailureStateMachine Clone(); + IFailureStateMachine Clone(int initalPoints = 0, int initalState = 0); } -} \ No newline at end of file +} diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/ITimeout.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/ITimeout.cs new file mode 100644 index 0000000000..649edb6550 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/ITimeout.cs @@ -0,0 +1,36 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership.
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +using Org.Apache.REEF.Utilities.Attributes; +using Org.Apache.REEF.Wake.Time.Event; + +namespace Org.Apache.REEF.Network.Elastic.Failures +{ + /// + /// Failure event due to a timeout. + /// + [Unstable("0.16", "API may change")] + public interface ITimeout + { + /// + /// Method used to schedule a timer event of the proper type. + /// + /// How long to wait before the timer event is triggered + /// A timer event + Alarm GetAlarm(long timeout); + } +} diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/OperatorException.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/OperatorException.cs new file mode 100644 index 0000000000..81146b4683 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/OperatorException.cs @@ -0,0 +1,119 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +using Org.Apache.REEF.Utilities.Attributes; +using System; +using System.Runtime.Serialization; + +namespace Org.Apache.REEF.Network.Elastic.Failures +{ + /// + /// A serializable exception that represents a task operator error. + /// + [Serializable] + [Unstable("0.16", "API may change")] + public class OperatorException : Exception, ISerializable + { + public readonly int _id; + public readonly string _additionalInfo; + + /// + /// Constructor. A serializable exception object that represents a task operator error. + /// All the operator related errors should be captured in this type of exception in order + /// to be properly handled by the elastic framework. + /// The exception message + /// The id of the operator where the exception is triggered + /// + public OperatorException(string message, int id) + : base(GetMessagePrefix(id) + message) + { + _id = id; + } + + /// + /// Constructor. A serializable exception object that represents a task operator error and wraps an inner exception. + /// + /// The exception message + /// The id of the operator where the exception is triggered + /// Inner exception + public OperatorException(string message, int id, Exception innerException) + : base(GetMessagePrefix(id) + message, innerException) + { + _id = id; + } + + /// + /// Constructor. A serializable exception object that represents a task operator error and wraps an inner exception + /// plus some additional operator specific information.
+ /// + /// The exception message + /// The id of the operator where the exception is triggered + /// Inner exception + /// Additional operator specific information on the failure + public OperatorException(string message, int id, Exception innerException, string info) + : base(GetMessagePrefix(id) + message, innerException) + { + _id = id; + _additionalInfo = info; + } + + /// + /// Constructor that generates an operator exception from a serialized buffer. + /// + /// The buffer containing the exception information + /// The streaming context + public OperatorException(SerializationInfo info, StreamingContext context) + : base(info, context) + { + _id = info.GetInt32("id"); + _additionalInfo = info.GetString("info"); + } + + /// + /// The identifier of the operator throwing the exception. + /// + public int OperatorId + { + get { return _id; } + } + + /// + /// Some additional info for the exception. + /// + public string AdditionalInfo + { + get { return _additionalInfo; } + } + + /// + /// Serialize the exception. NOTE(review): declared with 'new', so it hides the base GetObjectData instead of overriding it - confirm this is intended. + /// + /// The buffer where to add the exception information + /// The streaming context + public new void GetObjectData(SerializationInfo info, StreamingContext context) + { + base.GetObjectData(info, context); + info.AddValue("id", _id, typeof(int)); + info.AddValue("info", _additionalInfo, typeof(string)); + } + + private static string GetMessagePrefix(int id) + { + return "Operator " + id + " : "; + } + } +} \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/DefaultBroadcast.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/DefaultBroadcast.cs new file mode 100644 index 0000000000..b525121cd8 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/DefaultBroadcast.cs @@ -0,0 +1,95 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements.
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +using Org.Apache.REEF.Tang.Interface; +using Org.Apache.REEF.Network.Elastic.Failures; +using Org.Apache.REEF.Tang.Util; +using Org.Apache.REEF.Network.Elastic.Operators.Physical; +using Org.Apache.REEF.Network.Elastic.Topology.Logical; +using Org.Apache.REEF.Network.Elastic.Failures.Enum; +using Org.Apache.REEF.Utilities.Attributes; +using Org.Apache.REEF.Tang.Exceptions; +using Org.Apache.REEF.Tang.Implementations.Configuration; +using Org.Apache.REEF.Tang.Implementations.Tang; + +namespace Org.Apache.REEF.Network.Elastic.Operators.Logical.Default +{ + /// + /// Driver-side broadcast operator implementation. + /// + [Unstable("0.16", "API may change")] + internal sealed class DefaultBroadcast : DefaultOneToN, IElasticBroadcast + { + /// + /// Constructor for a driver-side broadcast operator.
+ /// + /// The identifier of the sender task + /// The previous operator in the pipeline + /// The topology for the broadcast operation + /// The failure machine managing the failures for the operator + /// The checkpoint level + /// Additional configurations for the operator + public DefaultBroadcast( + int senderId, + ElasticOperator prev, + ITopology topology, + IFailureStateMachine failureMachine, + CheckpointLevel checkpointLevel, + params IConfiguration[] configurations) : base( + senderId, + prev, + topology, + failureMachine, + checkpointLevel, + configurations) + { + OperatorType = OperatorType.Broadcast; + } + + /// + /// Generate the data serializer configuration for the target operator. + /// + /// The conf builder where to attach the codec configuration + internal override void GetCodecConfiguration(ref IConfiguration conf) + { + if (CodecMap.TryGetValue(typeof(T), out IConfiguration codecConf)) + { + conf = Configurations.Merge(conf, codecConf); + base.GetCodecConfiguration(ref conf); + } + else + { + throw new IllegalStateException($"Codec for type {typeof(T)} not found."); + } + } + + /// + /// Binding from logical to physical operator. 
+ /// + /// The configuration builder the binding will be added to + /// The physical operator configurations + protected override IConfiguration PhysicalOperatorConfiguration() + { + var physicalOperatorConf = TangFactory.GetTang().NewConfigurationBuilder() + .BindImplementation, Physical.Default.DefaultBroadcast>() + .Build(); + var messageconf = SetMessageType(); + + return Configurations.Merge(physicalOperatorConf, messageconf); + } + } +} \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/DefaultEmpty.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/DefaultEmpty.cs new file mode 100644 index 0000000000..50c9d36319 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/DefaultEmpty.cs @@ -0,0 +1,101 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License.
+ +using Org.Apache.REEF.Driver.Task; +using Org.Apache.REEF.Network.Elastic.Driver; +using Org.Apache.REEF.Network.Elastic.Failures; +using Org.Apache.REEF.Network.Elastic.Topology.Logical.Impl; +using Org.Apache.REEF.Tang.Exceptions; +using Org.Apache.REEF.Tang.Implementations.Tang; +using Org.Apache.REEF.Tang.Interface; +using Org.Apache.REEF.Utilities.Attributes; +using System.Collections.Generic; + +namespace Org.Apache.REEF.Network.Elastic.Operators.Logical.Default +{ + /// + /// Empty operator implementing the default failure logic. To use only as root of pipelines. + /// + [Unstable("0.16", "API may change")] + internal class DefaultEmpty : ElasticOperatorWithDefaultDispatcher + { + /// + /// Basic constructor for the empty operator. + /// + /// The stage the operator is part of + /// The failure machine governing the operator + public DefaultEmpty(IElasticStage stage, IFailureStateMachine failureMachine) : + base(stage, null, new EmptyTopology(), failureMachine) + { + OperatorType = OperatorType.Empty; + MasterId = 1; + WithinIteration = false; + } + + /// + /// Used to react to a failure that occurred on a task. This operator only forwards the call to the next operator, if any. + /// It gets a failed task as input and in response it produces zero or more failure events. + /// + /// The failed task + /// A list of events encoding the type of actions to be triggered so far + /// If the task failure cannot be properly handled + public override void OnTaskFailure(IFailedTask task, ref List failureEvents) + { + _next?.OnTaskFailure(task, ref failureEvents); + } + + /// + /// Logs the current operator state. The body is empty for this operator. + /// + protected override void LogOperatorState() + { + } + + /// + /// This method is operator specific and serializes the operator configuration into the input list.
+ /// + /// A list the serialized operator configuration will be appended to + /// The task id of the task that belongs to this operator + protected override void GetOperatorConfiguration(ref IList serializedOperatorsConfs, int taskId) + { + } + + /// + /// Binding from logical to physical operator. + /// + /// The configuration builder the binding will be added to + /// The physical operator configurations + protected override IConfiguration PhysicalOperatorConfiguration() + { + return TangFactory.GetTang().NewConfigurationBuilder().Build(); + } + + /// + /// Utility method gathering the set of master task ids of the operators in the current pipeline. + /// + /// The id of the master tasks of the current and successive operators + internal override void GatherMasterIds(ref HashSet masterTasks) + { + if (!_operatorFinalized) + { + throw new IllegalStateException("Operator need to be build before finalizing the stage"); + } + + _next?.GatherMasterIds(ref masterTasks); + } + } +} \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/DefaultOneToN.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/DefaultOneToN.cs new file mode 100644 index 0000000000..87d6beef5c --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/DefaultOneToN.cs @@ -0,0 +1,219 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License.
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +using Org.Apache.REEF.Tang.Interface; +using Org.Apache.REEF.Network.Elastic.Failures; +using Org.Apache.REEF.Network.Elastic.Topology.Logical; +using Org.Apache.REEF.Utilities; +using Org.Apache.REEF.Network.Elastic.Comm; +using Org.Apache.REEF.Driver.Task; +using System.Collections.Generic; +using System; +using Org.Apache.REEF.Utilities.Logging; +using Org.Apache.REEF.Network.Elastic.Failures.Enum; +using Org.Apache.REEF.Utilities.Attributes; +using Org.Apache.REEF.Network.Elastic.Comm.Enum; +using Org.Apache.REEF.Network.Elastic.Failures.Default; +using Org.Apache.REEF.Tang.Implementations.Tang; +using Org.Apache.REEF.Network.Elastic.Config; +using System.Linq; + +namespace Org.Apache.REEF.Network.Elastic.Operators.Logical.Default +{ + /// + /// Generic implementation of an operator having one node sending to N nodes + /// and with default failure behaviour. + /// + [Unstable("0.16", "API may change")] + internal abstract class DefaultOneToN : ElasticOperatorWithDefaultDispatcher + { + private static readonly Logger Log = Logger.GetLogger(typeof(DefaultOneToN<>)); + + private volatile bool _stop = false; + + /// + /// Constructor for an operator where one node sends to N nodes and with default + /// failure behavior. 
+ /// + /// The identifier of the task sending the message + /// The previous node in the pipeline (dereferenced in the constructor body, so it must not be null) + /// The topology the message routing protocol will use + /// The failure machine for this operator + /// The checkpoint level for the operator + /// Additional operator specific configurations + public DefaultOneToN( + int senderId, + ElasticOperator prev, + ITopology topology, + IFailureStateMachine failureMachine, + CheckpointLevel checkpointLevel, + params IConfiguration[] configurations) : base( + null, + prev, + topology, + failureMachine, + checkpointLevel, + configurations) + { + MasterId = senderId; + WithinIteration = prev.WithinIteration; + } + + /// + /// Operator specific logic for reacting when a task message is received. + /// + /// Incoming message from a task + /// Zero or more reply messages for the task + /// True if the operator has reacted to the task message + protected override bool ReactOnTaskMessage( + ITaskMessage message, + out IEnumerable returnMessages) + { + var offset = BitConverter.ToUInt16(message.Message, 0); + offset += sizeof(ushort); + var msgReceived = (TaskMessageType)BitConverter.ToUInt16(message.Message, offset); + offset += sizeof(ushort); + + returnMessages = new List(); + + switch (msgReceived) + { + case TaskMessageType.JoinTopology: + { + var operatorId = BitConverter.ToInt16(message.Message, offset); + + if (operatorId != _id) + { + return false; + } + + if (!Stage.IsCompleted && _failureMachine.State.FailureState < (int)DefaultFailureStates.Fail) + { + var taskId = message.TaskId; + Log.Log(Level.Info, "{0} joins the topology for operator {1}", taskId, _id); + + _topology.AddTask(taskId, _failureMachine); + } + + return true; + } + case TaskMessageType.TopologyUpdateRequest: + { + var operatorId = BitConverter.ToInt16(message.Message, offset); + + if (operatorId != _id) + { + return false; + } + + Log.Log(Level.Info, "Received topology update request for {0} {1} from {2}", + OperatorType, _id, message.TaskId); + +
((List)returnMessages).AddRange(_topology.TopologyUpdateResponse( + message.TaskId, + Optional.Of(_failureMachine))); + + if (_stop) + { + if (_failureMachine.State.FailureState < (int)DefaultFailureStates.StopAndReschedule) + { + _stop = false; + } + else + { + ((List)returnMessages).Clear(); // Remove all messages. + Log.Log(Level.Info, "Operator {0} is in stopped: Waiting.", OperatorType); + } + } + + return true; + } + case TaskMessageType.CompleteStage: + { + _failureMachine.Complete(); + Stage.Complete(); + + return true; + } + + default: + return false; + } + } + + /// + /// Mechanism to execute when a reconfigure event is triggered. + /// + /// + public override void OnReconfigure(ref ReconfigureEvent reconfigureEvent) + { + Log.Log(Level.Info, "Going to reconfigure the {0} operator", OperatorType); + + if (reconfigureEvent.FailedTask.IsPresent()) + { + var error = reconfigureEvent.FailedTask.Value.AsError() as OperatorException; + + reconfigureEvent.FailureResponse.AddRange( + _topology.Reconfigure( + reconfigureEvent.FailedTask.Value.Id, + error?.AdditionalInfo, + reconfigureEvent.Iteration)); + } + } + + /// + /// Mechanism to execute when a reschedule event is triggered. + /// + /// + public override void OnReschedule(ref RescheduleEvent rescheduleEvent) + { + // Iterators manage the re-scheduling of tasks. If no iterator exists, set up the rescheduling here. + if (!WithinIteration) + { + Log.Log(Level.Info, "Going to reschedule task {0}", rescheduleEvent.TaskId); + + if (!rescheduleEvent.RescheduleTaskConfigurations.TryGetValue( + Stage.StageName, + out IList confs)) + { + confs = new List(); + rescheduleEvent.RescheduleTaskConfigurations.Add(Stage.StageName, confs); + } + confs.Add(TangFactory.GetTang().NewConfigurationBuilder() + .BindNamedParam("true") + .Build()); + } + + var reconfigureEvent = rescheduleEvent as ReconfigureEvent; + + OnReconfigure(ref reconfigureEvent); + } + + /// + /// Mechanism to execute when a stop event is triggered. 
+ /// + /// + public override void OnStop(ref StopEvent stopEvent) + { + _stop = true; + + var rescheduleEvent = stopEvent as RescheduleEvent; + + OnReschedule(ref rescheduleEvent); + } + } +} \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/ElasticOperatorWithDefaultDispatcher.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/ElasticOperatorWithDefaultDispatcher.cs new file mode 100644 index 0000000000..1dbd287b81 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/ElasticOperatorWithDefaultDispatcher.cs @@ -0,0 +1,259 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +using Org.Apache.REEF.Network.Elastic.Driver; +using Org.Apache.REEF.Network.Elastic.Failures; +using Org.Apache.REEF.Tang.Interface; +using Org.Apache.REEF.Utilities.Logging; +using Org.Apache.REEF.Network.Elastic.Topology.Logical; +using System.Collections.Generic; +using Org.Apache.REEF.Driver.Task; +using Org.Apache.REEF.Utilities; +using Org.Apache.REEF.Network.Elastic.Comm; +using Org.Apache.REEF.Wake.Time.Event; +using Org.Apache.REEF.Network.Elastic.Failures.Enum; +using Org.Apache.REEF.Utilities.Attributes; +using Org.Apache.REEF.Network.Elastic.Failures.Default; + +namespace Org.Apache.REEF.Network.Elastic.Operators.Logical.Default +{ + /// + /// Abstract operator implementing the default failure logic. + /// This can be used as super class for default operators. + /// + [Unstable("0.16", "API may change")] + internal abstract class ElasticOperatorWithDefaultDispatcher : ElasticOperator, IDefaultFailureEventResponse + { + private static readonly Logger Log = Logger.GetLogger(typeof(ElasticOperatorWithDefaultDispatcher)); + + /// + /// Base constructor for an abstract operator implementing the default failure logic. + /// + /// The stage the operator is part of + /// The previous operator in the pipeline + /// The topology for the operator + /// The failure machine of the operator + /// The checkpoint level for the operator + /// Additional operator specific configurations + protected ElasticOperatorWithDefaultDispatcher( + IElasticStage stage, + ElasticOperator prev, ITopology topology, + IFailureStateMachine failureMachine, + CheckpointLevel checkpointLevel = CheckpointLevel.None, + params IConfiguration[] configurations) : + base(stage, prev, topology, failureMachine, checkpointLevel, configurations) + { + } + + /// + /// Add the broadcast operator to the operator pipeline. 
+ /// + /// The type of messages that the operator will send / receive + /// The id of the sender / root node of the broadcast + /// The topology of the operator + /// The failure state machine of the operator + /// The checkpoint policy for the operator + /// Additional configurations for the operator + /// The same operator pipeline with the added broadcast operator + public override ElasticOperator Broadcast( + int senderId, + ITopology topology, + IFailureStateMachine failureMachine, + CheckpointLevel checkpointLevel, + params IConfiguration[] configurations) + { + _next = new DefaultBroadcast(senderId, this, topology, failureMachine, checkpointLevel, configurations); + return _next; + } + + /// + /// Used to react on a failure occurred on a task. + /// It gets a failed task as input and in response it produces zero or more failure events. + /// + /// The failed task + /// A list of events encoding the type of actions to be triggered so far + /// If the task failure cannot be properly handled + public override void OnTaskFailure(IFailedTask task, ref List failureEvents) + { + var failedOperatorId = (task.AsError() as OperatorException)?.OperatorId ?? 
_id; + + if (WithinIteration || failedOperatorId <= _id) + { + int lostDataPoints = _topology.RemoveTask(task.Id); + var failureState = _failureMachine.RemoveDataPoints(lostDataPoints); + + switch ((DefaultFailureStates)failureState.FailureState) + { + case DefaultFailureStates.ContinueAndReconfigure: + failureEvents.Add(new ReconfigureEvent(task, _id)); + break; + + case DefaultFailureStates.ContinueAndReschedule: + if (failedOperatorId == _id) + { + var @event = new Failures.Default.RescheduleEvent(task.Id) + { + FailedTask = Optional.Of(task) + }; + failureEvents.Add(@event); + } + break; + + case DefaultFailureStates.StopAndReschedule: + { + var @event = new StopEvent(task.Id); + if (failedOperatorId == _id) + { + @event.FailedTask = Optional.Of(task); + } + failureEvents.Add(@event); + } + break; + + case DefaultFailureStates.Fail: + failureEvents.Add(new FailEvent(task.Id)); + break; + + default: + Log.Log(Level.Info, "Failure from {0} requires no action", task.Id); + break; + } + + LogOperatorState(); + } + + if (PropagateFailureDownstream()) + { + _next?.OnTaskFailure(task, ref failureEvents); + } + } + + /// + /// Used to react when a timeout event is triggered. + /// This implementation only forwards the timeout to the next operator, if any. + /// + /// The alarm triggering the timeout + /// A list of messages encoding how remote tasks need to react + /// The next timeouts to be scheduled + public override void OnTimeout( + Alarm alarm, + ref List msgs, + ref List nextTimeouts) + { + _next?.OnTimeout(alarm, ref msgs, ref nextTimeouts); + } + + /// + /// When a new failure state is reached, this method is used to dispatch + /// such event to the proper failure mitigation logic. + /// It gets a failure event as input and produces zero or more failure response messages + /// for tasks (appended into the event). 
+ /// + /// The failure event to react upon + public override void EventDispatcher(ref IFailureEvent @event) + { + if (@event.OperatorId == _id || @event.OperatorId < 0) + { + switch ((DefaultFailureStateEvents)@event.FailureEvent) + { + case DefaultFailureStateEvents.Reconfigure: + var rec = @event as ReconfigureEvent; + OnReconfigure(ref rec); + break; + + case DefaultFailureStateEvents.Reschedule: + var res = @event as RescheduleEvent; + OnReschedule(ref res); + break; + + case DefaultFailureStateEvents.Stop: + var stp = @event as StopEvent; + OnStop(ref stp); + break; + + default: + OnFail(); + break; + } + } + + if (@event.OperatorId == -1 || @event.OperatorId > _id) + { + _next?.EventDispatcher(ref @event); + } + } + + /// + /// Mechanism to execute when a reconfigure event is triggered. + /// + /// + public virtual void OnReconfigure(ref ReconfigureEvent reconfigureEvent) + { + } + + /// + /// Mechanism to execute when a reschedule event is triggered. + /// + /// + public virtual void OnReschedule(ref RescheduleEvent rescheduleEvent) + { + } + + /// + /// Mechanism to execute when a stop event is triggered. + /// + /// + public virtual void OnStop(ref StopEvent stopEvent) + { + } + + /// + /// Mechanism to execute when a fail event is triggered. + /// + public virtual void OnFail() + { + } + + /// + /// Returns whether a failure should be propagated to downstream operators or not. + /// + /// True if the failure has to be sent downstream + protected override bool PropagateFailureDownstream() + { + return _failureMachine.State.FailureState.IsContinue() || + _failureMachine.State.FailureState.IsContinueAndReconfigure() || + _failureMachine.State.FailureState.IsContinueAndReschedule(); + } + + /// + /// Logs the current operator state. 
+ /// + protected override void LogOperatorState() + { + if (Log.IsLoggable(Level.Info)) + { + Log.Log(Level.Info, + "State for Operator {0} in Stage {1}:\n" + + "Topology:\n{2}\n" + + "Failure State: {3}\n" + + "Failure(s) Reported: {4}/{5}", + OperatorType, Stage.StageName, _topology.LogTopologyState(), + (DefaultFailureStates)_failureMachine.State.FailureState, + _failureMachine.NumOfFailedDataPoints, _failureMachine.NumOfDataPoints); + } + } + } +} \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/ElasticOperator.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/ElasticOperator.cs new file mode 100644 index 0000000000..865b3aba71 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/ElasticOperator.cs @@ -0,0 +1,525 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +using System; +using Org.Apache.REEF.Driver.Task; +using Org.Apache.REEF.Tang.Interface; +using Org.Apache.REEF.Network.Elastic.Driver; +using Org.Apache.REEF.Tang.Exceptions; +using Org.Apache.REEF.Network.Elastic.Failures; +using Org.Apache.REEF.Utilities.Logging; +using System.Globalization; +using Org.Apache.REEF.Tang.Implementations.Tang; +using Org.Apache.REEF.Tang.Util; +using System.Collections.Generic; +using Org.Apache.REEF.Tang.Implementations.Configuration; +using Org.Apache.REEF.Network.Elastic.Topology.Logical; +using Org.Apache.REEF.Network.Elastic.Topology.Logical.Impl; +using Org.Apache.REEF.Network.Elastic.Config; +using Org.Apache.REEF.Network.Elastic.Comm; +using Org.Apache.REEF.Wake.Time.Event; +using Org.Apache.REEF.Utilities.Attributes; +using Org.Apache.REEF.Network.Elastic.Failures.Enum; +using Org.Apache.REEF.Network.Elastic.Topology.Logical.Enum; +using Org.Apache.REEF.Wake.StreamingCodec.CommonStreamingCodecs; +using Org.Apache.REEF.Wake.StreamingCodec; +using System.Linq; + +namespace Org.Apache.REEF.Network.Elastic.Operators.Logical +{ + /// + /// Basic implementation for logical operators. + /// Each operator is part of a stage and is parametrized by a topology, a failure + /// state machine and a checkpoint policy. + /// Operators are composed into pipelines. + /// Once a pipeline is finalized, tasks can be added to the operator, which + /// will in turn add the tasks to the topology and the failure state machine. + /// When no more tasks are added, the operator state must be finalized in order to + /// schedule the pipeline for execution. 
+ /// + [Unstable("0.16", "API may change")] + public abstract class ElasticOperator : IFailureResponse, ITaskMessageResponse + { + private static readonly Logger Log = Logger.GetLogger(typeof(ElasticOperator)); + + private static KeyValuePair Codec() + where TCodec : IStreamingCodec + { + return new KeyValuePair( + typeof(TType), StreamingCodecConfiguration.Conf + .Set(StreamingCodecConfiguration.Codec, GenericType.Class) + .Build()); + } + + private static Dictionary AsDictionary( + params KeyValuePair[] values) + { + return values.ToDictionary(kv => kv.Key, kv => kv.Value); + } + + protected static readonly Dictionary CodecMap = AsDictionary( + Codec(), + Codec(), + Codec(), + Codec() + ); + + // For the moment we consider only linear sequences (pipelines) of operators (no branching for e.g., joins) + protected ElasticOperator _next = null; + + protected readonly ElasticOperator _prev; + + protected readonly IFailureStateMachine _failureMachine; + protected readonly CheckpointLevel _checkpointLevel; + protected readonly ITopology _topology; + protected readonly int _id; + protected readonly IConfiguration[] _configurations; + + protected bool _operatorFinalized = false; + protected volatile bool _operatorStateFinalized = false; + protected IElasticStage _stage; + + /// + /// Specification for generic elastic operators. 
+ /// + /// The stage this operator is part of + /// The previous operator in the pipeline + /// The topology of the operator + /// The behavior of the operator under failures + /// The checkpoint policy for the operator + /// Additional configuration parameters + public ElasticOperator( + IElasticStage stage, + ElasticOperator prev, + ITopology topology, + IFailureStateMachine failureMachine, + CheckpointLevel checkpointLevel = CheckpointLevel.None, + params IConfiguration[] configurations) + { + _stage = stage; + _prev = prev; + _id = Stage.GetNextOperatorId(); + _topology = topology; + _failureMachine = failureMachine; + _checkpointLevel = checkpointLevel; + _configurations = configurations; + + _topology.OperatorId = _id; + _topology.StageName = Stage.StageName; + } + + /// + /// The identifier of the master / coordinator node for this operator. + /// + public int MasterId { get; protected set; } + + /// + /// The operator type. + /// + public OperatorType OperatorType { get; protected set; } + + /// + /// Whether the current operator is or is preceded by an iterator operator. + /// + public bool WithinIteration { get; protected set; } + + /// + /// The stage this operator is part of. + /// When no stage was given at construction, it is taken from the previous operator and cached; + /// an IllegalStateException is thrown if no previous operator can supply one. + /// + public IElasticStage Stage + { + get + { + if (_stage == null) + { + + _stage = _prev?.Stage ?? throw new IllegalStateException("The reference to the parent stage is lost."); + + return _prev.Stage; + } + + return _stage; + } + } + + /// + /// Add an instance of the broadcast operator to the operator pipeline + /// with default failure machine and no checkpointing. 
+ /// + /// The type of messages that the operator will send / receive + /// The topology of the operator + /// Additional configurations for the operator + /// The same operator pipeline with the added broadcast operator + public ElasticOperator Broadcast(TopologyType topology, params IConfiguration[] configurations) + { + return Broadcast( + MasterId, + GetTopology(topology), + _failureMachine.Clone(), + CheckpointLevel.None, + configurations); + } + + /// + /// Add the broadcast operator to the operator pipeline + /// with default failure machine. + /// + /// The type of messages that the operator will send / receive + /// The topology of the operator + /// The checkpoint policy for the operator + /// Additional configurations for the operator + /// The same operator pipeline with the added broadcast operator + public ElasticOperator Broadcast( + TopologyType topology, + CheckpointLevel checkpointLevel, + params IConfiguration[] configurations) + { + return Broadcast( + MasterId, + GetTopology(topology), + _failureMachine.Clone(), + checkpointLevel, + configurations); + } + + /// + /// Method triggered when a task to driver message is received. + /// If this operator does not handle the message, it is propagated to the next operator in the pipeline (if any). + /// + /// The task message for the operator + /// A list of messages containing the instructions for the task + /// True if the message was managed correctly, false otherwise + /// If the message cannot be handled correctly or + /// generate an incorrect state + public IEnumerable OnTaskMessage(ITaskMessage message) + { + // Forward only when this operator did not handle the message AND a next operator exists: + // Enumerable.Concat throws ArgumentNullException when its second sequence is null, + // which is what happened at the tail of the pipeline. + if (!ReactOnTaskMessage(message, out IEnumerable returnMessages) && _next != null) + { + return returnMessages.Concat(_next.OnTaskMessage(message)); + } + + return returnMessages; + } + + /// + /// Add a task to the operator. + /// The operator must have called Build() before adding tasks. 
+ /// + /// The id of the task to add + /// True if the task is new and is added to the operator + public virtual bool AddTask(string taskId) + { + var newTask = false; + + if (!_operatorFinalized) + { + throw new IllegalStateException("Operator needs to be finalized before adding tasks."); + } + + // If state is finalized, tasks can join the topology only explicitly. + newTask = _operatorStateFinalized || _topology.AddTask(taskId, _failureMachine); + + // A task is new if it got added by at least one operator. + // NOTE(review): when _next is null the `?? true` fallback makes the last operator return true + // regardless of newTask, so the whole pipeline always reports true - confirm this is intended. + return (_next?.AddTask(taskId) ?? true) || newTask; + } + + /// + /// Finalizes the operator. + /// Also builds the previous operator, if any, before marking this one as finalized. + /// + /// The same finalized operator + public virtual ElasticOperator Build() + { + if (_operatorFinalized) + { + throw new IllegalStateException("Operator cannot be built more than once."); + } + + _prev?.Build(); + + _operatorFinalized = true; + + return this; + } + + /// + /// Finalizes the operator state. After BuildState, no more tasks can be added + /// to the Operator. + /// The next operator's state (if any) is finalized first, then this operator's topology is built. + /// + /// The same operator with the finalized state + public virtual ElasticOperator BuildState() + { + if (_operatorStateFinalized) + { + throw new IllegalStateException("Operator state cannot be built more than once."); + } + + if (!_operatorFinalized) + { + throw new IllegalStateException("Operator need to be build before finalizing its state."); + } + + _next?.BuildState(); + + _topology.Build(); + + LogOperatorState(); + + _operatorStateFinalized = true; + + return this; + } + + /// + /// Generate the data serializer configuration for the target operator. + /// + /// The conf builder where to attach the codec configuration + internal virtual void GetCodecConfiguration(ref IConfiguration conf) + { + _next?.GetCodecConfiguration(ref conf); + } + + /// + /// Whether this is the last iterator in the pipeline. + /// + /// True if this is the last iterator + public virtual bool CheckIfLastIterator() + { + return _next?.CheckIfLastIterator() ?? 
true; + } + + /// + /// Add the broadcast operator to the operator pipeline. + /// + /// The type of messages that the operator will send / receive + /// The id of the sender / root node of the broadcast + /// The topology of the operator + /// The failure state machine of the operator + /// The checkpoint policy for the operator + /// Additional configurations for the operator + /// The same operator pipeline with the added broadcast operator + public abstract ElasticOperator Broadcast( + int senderId, + ITopology topology, + IFailureStateMachine failureMachine, + CheckpointLevel checkpointLevel = CheckpointLevel.None, + params IConfiguration[] configurations); + + /// + /// Used to react on a failure occurred on a task. + /// It gets a failed task as input and in response it produces zero or more failure events. + /// + /// The failed task + /// A list of events encoding the type of actions to be triggered so far + /// If the task failure cannot be properly handled + public abstract void OnTaskFailure(IFailedTask task, ref List failureEvents); + + /// + /// Used to react when a timeout event is triggered. + /// It receives the triggering alarm together with the message and timeout lists passed by reference. + /// + /// The alarm triggering the timeout + /// A list of messages encoding how remote tasks need to react + /// The next timeouts to be scheduled + public abstract void OnTimeout( + Alarm alarm, + ref List msgs, + ref List nextTimeouts); + + /// + /// When a new failure state is reached, this method is used to dispatch + /// such event to the proper failure mitigation logic. + /// It gets a failure event as input and produces zero or more failure response messages + /// for tasks (appended into the event). + /// + /// The failure event to react upon + public abstract void EventDispatcher(ref IFailureEvent @event); + + /// + /// Appends the operator configuration for the input task to the input configuration. 
+ /// Must be called only after Build() and BuildState() have been called. + /// This method should be called from the root operator at beginning of the pipeline. + /// + /// The list the operator configuration we will be appending to + /// The id of the task that belongs to this operator + /// The configuration for the task with added operator information + internal void GetTaskConfiguration(ref IList serializedOperatorsConfs, int taskId) + { + if (_operatorFinalized && _operatorStateFinalized) + { + GetOperatorConfiguration(ref serializedOperatorsConfs, taskId); + + _next?.GetTaskConfiguration(ref serializedOperatorsConfs, taskId); + } + else + { + throw new IllegalStateException("Operator needs to be finalized before getting tasks configuration."); + } + } + + /// + /// Whether this operator is ready to be scheduled by the task set manager. + /// + /// True if the operator is ready to be scheduled + internal bool CanBeScheduled() + { + return _topology.CanBeScheduled() && (_next?.CanBeScheduled() ?? true); + } + + /// + /// Utility method gathering the set of master task ids of the operators in the current pipeline. + /// + /// The id of the master tasks of the current and successive operators + internal virtual void GatherMasterIds(ref HashSet masterTasks) + { + if (!_operatorFinalized) + { + throw new IllegalStateException("Operator need to be build before gathering information."); + } + + masterTasks.Add(Utils.BuildTaskId(Stage.StageName, MasterId)); + + _next?.GatherMasterIds(ref masterTasks); + } + + /// + /// Log the final statistics of the operator. + /// This is called when the pipeline execution is completed. + /// + internal virtual string LogFinalStatistics() + { + return LogInternalStatistics() + _next?.LogFinalStatistics(); + } + + /// + /// Appends the message type to the configuration. 
+ /// + /// The configuration with the message type added + protected IConfiguration SetMessageType() + { + return TangFactory.GetTang().NewConfigurationBuilder() + .BindStringNamedParam(typeof(TMsg).AssemblyQualifiedName) + .Build(); + } + + /// + /// Action to trigger when the operator receives a notification that a new iteration is started. + /// The notification is applied to this operator's topology and then passed to the next operator, if any. + /// + /// The new iteration number + protected void OnNewIteration(int iteration) + { + _topology.OnNewIteration(iteration); + + _next?.OnNewIteration(iteration); + } + + /// + /// This method is operator specific and serializes the operator configuration into the input list. + /// + /// A list the serialized operator configuration will be + /// appended to + /// The task id of the task that belongs to this operator + protected virtual void GetOperatorConfiguration(ref IList serializedOperatorsConfs, int taskId) + { + var operatorBuilderWithTaskConf = _topology.GetTaskConfiguration(taskId); + var operatorBuilderWithTaskAndPhysicalConf = PhysicalOperatorConfiguration(); + IConfiguration operatorConf = TangFactory.GetTang().NewConfigurationBuilder() + .BindNamedParam("" + (!Stage.IsIterative && _next == null)) + .BindIntNamedParam("" + _id) + .BindIntNamedParam("" + (int)_checkpointLevel) + .Build(); + + operatorConf = Configurations.Merge( + operatorConf, + operatorBuilderWithTaskConf, + operatorBuilderWithTaskAndPhysicalConf, + Configurations.Merge(_configurations)); + + Stage.Context.SerializeOperatorConfiguration(ref serializedOperatorsConfs, operatorConf); + } + + /// + /// Returns whether a failure should be propagated to downstream operators or not. + /// + /// True if the failure has to be sent downstream + protected virtual bool PropagateFailureDownstream() + { + return true; + } + + /// + /// Operator specific logic for reacting when a task message is received. 
+ /// + /// Incoming message from a task + /// Zero or more reply messages for the task + /// True if the operator has reacted to the task message + protected virtual bool ReactOnTaskMessage(ITaskMessage message, out IEnumerable returnMessages) + { + returnMessages = new IElasticDriverMessage[] { }; + return false; + } + + /// + /// Logs the current operator state. + /// + protected virtual void LogOperatorState() + { + if (Log.IsLoggable(Level.Info)) + { + Log.Log(Level.Info, + "State for Operator {0} in Stage {1}:\n" + + "Topology:\n{2}" + + "Failure State: {3}\n" + + "Failure(s) Reported: {4}", + OperatorType, Stage.StageName, _topology.LogTopologyState(), + _failureMachine.State.FailureState, _failureMachine.NumOfFailedDataPoints); + } + } + + /// + /// Log the final internal statistics of the operator. + /// + protected virtual string LogInternalStatistics() + { + return _topology.LogFinalStatistics(); + } + + /// + /// Binding from logical to physical operator. + /// + /// The physical operator configuration + protected abstract IConfiguration PhysicalOperatorConfiguration(); + + // Creates the topology instance for the requested type, constructed with MasterId. + // Only TopologyType.Flat is handled; any other value throws ArgumentException. + private ITopology GetTopology(TopologyType topologyType) + { + ITopology topology; + + switch (topologyType) + { + case TopologyType.Flat: + topology = new FlatTopology(MasterId); + break; + + default: + // ArgumentException takes (message, paramName) in that order. + throw new ArgumentException( + $"Topology type {topologyType} not supported by {OperatorType}.", + nameof(topologyType)); + } + + return topology; + } + } +} \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/IElasticBroadcast.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/IElasticBroadcast.cs new file mode 100644 index 0000000000..6142ec1c1d --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/IElasticBroadcast.cs @@ -0,0 +1,29 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +using Org.Apache.REEF.Utilities.Attributes; + +namespace Org.Apache.REEF.Network.Elastic.Operators.Logical +{ + /// + /// Elastic group communication operator used to broadcast messages. + /// + [Unstable("0.16", "API may change")] + public interface IElasticBroadcast + { + } +} diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/OperatorType.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/OperatorType.cs new file mode 100644 index 0000000000..2899b5d76e --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/OperatorType.cs @@ -0,0 +1,37 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +using Org.Apache.REEF.Utilities.Attributes; +using System; + +namespace Org.Apache.REEF.Network.Elastic.Operators +{ + /// + /// Constants labeling the set of available operators. + /// + [Unstable("0.16", "Constants may change")] + public enum OperatorType : int + { + Empty = 0, + Broadcast = 1, + Reduce = 2, + AggregationRing = 3, + Iterate = 4, + Scatter = 5, + Gather = 6 + } +} \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Default/DefaultBroadcast.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Default/DefaultBroadcast.cs new file mode 100644 index 0000000000..f0e2ca22a1 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Default/DefaultBroadcast.cs @@ -0,0 +1,71 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +using Org.Apache.REEF.Tang.Annotations; +using Org.Apache.REEF.Network.Elastic.Topology.Physical.Default; +using Org.Apache.REEF.Network.Elastic.Config; +using Org.Apache.REEF.Utilities.Attributes; +using Org.Apache.REEF.Network.Elastic.Operators.Physical.Enum; + +namespace Org.Apache.REEF.Network.Elastic.Operators.Physical.Default +{ + /// + /// Default implementation of a group communication operator used to broadcast messages. + /// + /// The type of message being sent. + [Unstable("0.16", "API may change")] + public sealed class DefaultBroadcast : DefaultOneToN, IElasticBroadcast + { + /// + /// Creates a new Broadcast operator. + /// + /// The operator identifier + /// Whether this operator is the last in the pipeline + /// The operator topology layer + [Inject] + private DefaultBroadcast( + [Parameter(typeof(OperatorParameters.OperatorId))] int id, + [Parameter(typeof(OperatorParameters.IsLast))] bool isLast, + DefaultBroadcastTopology topology) : base(id, isLast, topology) + { + OperatorType = OperatorType.Broadcast; + } + + /// + /// Send the data to all child receivers. + /// Send is asynchronous but works in 3 phases: + /// 1-The task asks the driver for updates to the topology + /// 2-Updates are received and added to the local topology + /// --(Note that although the method is non-blocking, no message will be sent until + /// updates are received) + /// 3-Send the message. + /// + /// The data to send + public void Send(T data) + { + _topology.TopologyUpdateRequest(); + + _position = PositionTracker.InSend; + + int iteration = (int)(IteratorReference?.Current ?? 
0); + var message = _topology.GetDataMessage(iteration, data); + + _topology.Send(message, CancellationSource); + + _position = PositionTracker.AfterSend; + } + } +} \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Default/DefaultOneToN.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Default/DefaultOneToN.cs new file mode 100644 index 0000000000..4c5782a2fc --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Default/DefaultOneToN.cs @@ -0,0 +1,187 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +using System.Threading; +using System.Collections.Generic; +using Org.Apache.REEF.Network.Elastic.Topology.Physical.Default; +using System; +using Org.Apache.REEF.Network.Elastic.Comm.Impl; +using Org.Apache.REEF.Utilities.Logging; +using Org.Apache.REEF.Utilities.Attributes; +using Org.Apache.REEF.Network.Elastic.Operators.Physical.Enum; +using Org.Apache.REEF.Network.Elastic.Comm; + +namespace Org.Apache.REEF.Network.Elastic.Operators.Physical.Default +{ + /// + /// Generic implementation of a group communication operator where one node sends to N. + /// + /// The type of message being sent. 
// File: lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Default/DefaultOneToN.cs
using System.Threading;
using System.Collections.Generic;
using Org.Apache.REEF.Network.Elastic.Topology.Physical.Default;
using System;
using Org.Apache.REEF.Network.Elastic.Comm.Impl;
using Org.Apache.REEF.Utilities.Logging;
using Org.Apache.REEF.Utilities.Attributes;
using Org.Apache.REEF.Network.Elastic.Operators.Physical.Enum;
using Org.Apache.REEF.Network.Elastic.Comm;

namespace Org.Apache.REEF.Network.Elastic.Operators.Physical.Default
{
    /// <summary>
    /// Generic implementation of a group communication operator where one node sends to N.
    /// </summary>
    /// <remarks>
    /// NOTE(review): generic type-parameter lists appear to have been stripped from this
    /// listing (the logger uses typeof(DefaultOneToN&lt;&gt;) and Receive returns T, yet no
    /// type parameter is declared). Type names below are reproduced exactly as found;
    /// restore the type arguments from the original file.
    /// </remarks>
    [Unstable("0.16", "API may change")]
    public abstract class DefaultOneToN : IDisposable, IReschedulable
    {
        private static readonly Logger Log = Logger.GetLogger(typeof(DefaultOneToN<>));

        internal readonly OneToNTopology _topology;

        // Written by Send/Receive/Reset and read by FailureInfo.
        internal volatile PositionTracker _position = PositionTracker.Nil;

        private readonly bool _isLast;

        // Set only by WaitCompletionBeforeDisposing; gates the StageComplete call in Dispose.
        private bool _cleanDisposal = false;

        /// <summary>
        /// Creates a new one to N operator.
        /// </summary>
        /// <param name="id">The operator identifier</param>
        /// <param name="isLast">Whether this operator is the last in the pipeline</param>
        /// <param name="topology">The operator topology layer</param>
        internal DefaultOneToN(int id, bool isLast, OneToNTopology topology)
        {
            OperatorId = id;
            _isLast = isLast;
            _topology = topology;
        }

        /// <summary>
        /// The operator identifier.
        /// </summary>
        public int OperatorId { get; }

        /// <summary>
        /// The operator type.
        /// </summary>
        public OperatorType OperatorType { get; protected set; }

        /// <summary>
        /// Operator-specific information that is sent to the driver in case of failure.
        /// The value is three colon-separated fields: the iterator's current value
        /// (-1 when no iterator is attached), the numeric value of the position tracker,
        /// and the topology's IsSending value.
        /// </summary>
        public string FailureInfo
        {
            get
            {
                return $"{IteratorReference?.Current ?? -1:d}:{_position:d}:{_topology.IsSending:d}";
            }
        }

        /// <summary>
        /// Reference to the iterator in the pipeline (null if there is none).
        /// </summary>
        public IElasticIterator IteratorReference { protected get; set; }

        /// <summary>
        /// Cancellation source for stopping the execution of the operator.
        /// </summary>
        public CancellationTokenSource CancellationSource { get; set; }

        /// <summary>
        /// Action to execute when a task is re-scheduled.
        /// </summary>
        /// <returns>A delegate bound to the topology's JoinTopology method</returns>
        public Action OnTaskRescheduled()
        {
            return _topology.JoinTopology;
        }

        /// <summary>
        /// The set of messages checkpointed in memory.
        /// NOTE(review): not referenced anywhere in the visible code; the element type of the
        /// list is missing from this listing.
        /// </summary>
        private List CheckpointedMessages { get; set; }

        /// <summary>
        /// Receive a message from the topology.
        /// In an iterative pipeline, messages whose iteration is older than the iterator's
        /// current value are logged and skipped; once a message is accepted the iterator is
        /// synchronized to that message's iteration.
        /// </summary>
        /// <returns>The incoming data</returns>
        /// <exception cref="OperationCanceledException">
        /// No message was accepted: cancellation was requested, or the topology returned no message.
        /// </exception>
        public T Receive()
        {
            _position = PositionTracker.InReceive;

            var isIterative = IteratorReference != null;
            ITypedDataMessage accepted = null;

            while (accepted == null && !CancellationSource.IsCancellationRequested)
            {
                var candidate = (ITypedDataMessage)_topology.Receive(CancellationSource);

                if (candidate == null)
                {
                    // Nothing was delivered: stop waiting and report it below instead of
                    // dereferencing a null message.
                    break;
                }

                if (isIterative && candidate.Iteration < (int)IteratorReference.Current)
                {
                    Log.Log(Level.Warning, "Received message for iteration {0} but I am already in iteration "
                        + "{1}: ignoring.", candidate.Iteration, (int)IteratorReference.Current);
                    continue;
                }

                accepted = candidate;
            }

            // Only an accepted message may be returned. A stale message that happened to be the
            // last one read before cancellation must not leak to the caller.
            if (accepted == null)
            {
                throw new OperationCanceledException("Impossible to receive messages: operation cancelled.");
            }

            if (isIterative)
            {
                IteratorReference.SyncIteration(accepted.Iteration);
            }

            _position = PositionTracker.AfterReceive;

            return accepted.Data;
        }

        /// <summary>
        /// Reset the internal position tracker. This should be called
        /// every time a new iteration starts in the workflow.
        /// </summary>
        public void Reset()
        {
            _position = PositionTracker.Nil;
        }

        /// <summary>
        /// Logs the wait and delegates to the topology's WaitForTaskRegistration.
        /// </summary>
        /// <param name="cancellationSource">The source used to cancel the wait</param>
        public void WaitForTaskRegistration(CancellationTokenSource cancellationSource)
        {
            Log.Log(Level.Info, "Waiting for task registration for {0} operator.", OperatorType);
            _topology.WaitForTaskRegistration(cancellationSource);
        }

        /// <summary>
        /// Wait until computation is globally completed for this operator
        /// before disposing the object. Marks the operator as cleanly disposable afterwards.
        /// </summary>
        public void WaitCompletionBeforeDisposing()
        {
            _topology.WaitCompletionBeforeDisposing(CancellationSource);
            _cleanDisposal = true;
        }

        /// <summary>
        /// Dispose the operator. The topology is notified that the stage is complete only when
        /// this operator is the last in the pipeline and WaitCompletionBeforeDisposing returned;
        /// the topology itself is disposed in every case.
        /// </summary>
        public void Dispose()
        {
            if (_isLast && _cleanDisposal)
            {
                _topology.StageComplete();
            }

            _topology.Dispose();
        }
    }
}
// File: lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Enum/PositionTracker.cs
using Org.Apache.REEF.Utilities.Attributes;

namespace Org.Apache.REEF.Network.Elastic.Operators.Physical.Enum
{
    /// <summary>
    /// Enum summarizing the positions in which the execution is within an operator.
    /// This information is used in case of failure to properly reconfigure computation.
    /// </summary>
    /// <remarks>
    /// Members carry explicit numeric values; the underlying type is the default one (int).
    /// </remarks>
    [Unstable("0.16", "API may change")]
    public enum PositionTracker
    {
        Nil = 0,
        InSend = 1,
        InReceive = 2,
        AfterReceiveBeforeSend = 3,
        AfterReceive = 4,
        AfterSendBeforeReceive = 5,
        AfterSend = 6,
    }
}
// Files (all under lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/):
//   IElasticBroadcast.cs, IElasticIterator.cs, IElasticOperator.cs
using Org.Apache.REEF.Network.Elastic.Task;
using Org.Apache.REEF.Utilities.Attributes;
using System;
using System.Collections;
using System.Threading;

namespace Org.Apache.REEF.Network.Elastic.Operators.Physical
{
    /// <summary>
    /// Group communication operator used to broadcast messages.
    /// Combines the typed operator contract with the receiver and sender contracts.
    /// </summary>
    /// <remarks>
    /// NOTE(review): generic type-parameter lists appear to have been stripped from this
    /// listing; the declaration is reproduced exactly as found.
    /// </remarks>
    [Unstable("0.16", "API may change")]
    public interface IElasticBroadcast : IElasticTypedOperator, IReceiver, ISender
    {
    }

    /// <summary>
    /// Group communication operator used for iterations.
    /// </summary>
    [Unstable("0.16", "API may change")]
    public interface IElasticIterator : IElasticOperator, IEnumerator
    {
        /// <summary>
        /// Register the action to trigger when a task is rescheduled.
        /// </summary>
        /// <param name="action">Some code to execute upon task rescheduling</param>
        void RegisterActionOnTaskRescheduled(Action action);

        /// <summary>
        /// Synchronize the current iteration with the input one.
        /// </summary>
        /// <param name="iteration">The state in which the iterator will be moved</param>
        void SyncIteration(int iteration);
    }

    /// <summary>
    /// Base interface for task-side, physical, group communication operators.
    /// </summary>
    [Unstable("0.16", "API may change")]
    public interface IElasticOperator : IWaitForTaskRegistration, IReschedulable, IDisposable
    {
        /// <summary>
        /// The operator identifier.
        /// </summary>
        int OperatorId { get; }

        /// <summary>
        /// The operator type.
        /// </summary>
        OperatorType OperatorType { get; }

        /// <summary>
        /// Cancellation source for stopping the execution of the operator.
        /// </summary>
        CancellationTokenSource CancellationSource { get; set; }

        /// <summary>
        /// Sets the reference to the iterator in the pipeline (if it exists).
        /// </summary>
        IElasticIterator IteratorReference { set; }

        /// <summary>
        /// Operator specific information in case of failure.
        /// </summary>
        string FailureInfo { get; }

        /// <summary>
        /// Reset the internal position tracker. This should be called
        /// every time a new iteration starts in the workflow.
        /// </summary>
        void Reset();

        /// <summary>
        /// Wait until computation is globally completed for this operator
        /// before disposing the object.
        /// </summary>
        void WaitCompletionBeforeDisposing();
    }
}
// Files (under lang/cs/Org.Apache.REEF.Network/Elastic/):
//   Operators/Physical/IElasticTypedOperator.cs, Operators/Physical/IReceiver.cs,
//   Operators/Physical/IReschedulable.cs, Operators/Physical/ISender.cs,
//   Task/CancellationSource.cs
using Org.Apache.REEF.Tang.Annotations;
using Org.Apache.REEF.Utilities.Attributes;
using System;
using System.Threading;

namespace Org.Apache.REEF.Network.Elastic.Operators.Physical
{
    /// <summary>
    /// Typed physical group communication operator.
    /// </summary>
    /// <remarks>
    /// The original documentation describes a type parameter ("the type of data managed by
    /// the operator").
    /// NOTE(review): the type-parameter list appears to have been stripped from this listing;
    /// the declaration is reproduced exactly as found.
    /// </remarks>
    [Unstable("0.16", "API may change")]
    public interface IElasticTypedOperator : IElasticOperator
    {
    }

    /// <summary>
    /// Group communication operator receiving messages.
    /// </summary>
    /// <remarks>
    /// The original documentation describes a type parameter ("the type of data being received").
    /// NOTE(review): T is used by Receive but the type-parameter list is missing from this
    /// listing; the declaration is reproduced exactly as found.
    /// </remarks>
    [Unstable("0.16", "API may change")]
    public interface IReceiver
    {
        /// <summary>
        /// Receive a message from a sender task.
        /// </summary>
        /// <returns>The incoming message</returns>
        T Receive();
    }

    /// <summary>
    /// Interface used when an action needs to be triggered after a rescheduling event.
    /// </summary>
    [Unstable("0.16", "API may change")]
    public interface IReschedulable
    {
        /// <summary>
        /// Action to execute when a task is re-scheduled.
        /// </summary>
        /// <returns>The action to run</returns>
        Action OnTaskRescheduled();
    }

    /// <summary>
    /// Group communication operator used to send messages to child tasks.
    /// </summary>
    /// <remarks>
    /// The original documentation describes a type parameter ("the data type of the message").
    /// NOTE(review): T is used by Send but the type-parameter list is missing from this
    /// listing; the declaration is reproduced exactly as found.
    /// </remarks>
    [Unstable("0.16", "API may change")]
    public interface ISender
    {
        /// <summary>
        /// Send the data to all child receivers.
        /// </summary>
        /// <param name="data">The data to send</param>
        void Send(T data);
    }
}

namespace Org.Apache.REEF.Network.Elastic.Task
{
    /// <summary>
    /// Generic cancellation source for task operations.
    /// This class wraps a CancellationTokenSource and is created through Tang injection,
    /// so that the same source can be injected through the elastic communication services.
    /// </summary>
    [Unstable("0.16", "API may change")]
    public sealed class CancellationSource : IDisposable
    {
        /// <summary>
        /// The wrapped cancellation source.
        /// </summary>
        public readonly CancellationTokenSource Source = new CancellationTokenSource();

        [Inject]
        private CancellationSource()
        {
        }

        /// <summary>
        /// Whether cancellation has been requested on the wrapped source.
        /// </summary>
        public bool IsCancelled => Source.IsCancellationRequested;

        /// <summary>
        /// Cancel the currently running computation.
        /// </summary>
        public void Cancel() => Source.Cancel();

        /// <summary>
        /// Dispose the wrapped source.
        /// </summary>
        public void Dispose() => Source.Dispose();
    }
}
// File: lang/cs/Org.Apache.REEF.Network/Elastic/Task/CommunicationLayer.cs
using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Linq;
using Org.Apache.REEF.Network.NetworkService;
using Org.Apache.REEF.Wake.Remote;
using Org.Apache.REEF.Wake;
using Org.Apache.REEF.Utilities.Logging;
using System.Threading;
using Org.Apache.REEF.Tang.Exceptions;
using Org.Apache.REEF.Network.Elastic.Comm.Impl;
using Org.Apache.REEF.Network.Elastic.Topology.Physical;
using Org.Apache.REEF.Utilities.Attributes;

namespace Org.Apache.REEF.Network.Elastic.Task.Impl
{
    /// <summary>
    /// Handles all incoming / outgoing messages for a given task.
    /// </summary>
    /// <remarks>
    /// NOTE(review): generic type arguments appear to have been stripped from this listing
    /// (see the IObserver base, StreamingNetworkService, the ConcurrentDictionary fields,
    /// OnNext, and the WaitForTaskRegistration parameters). They cannot all be determined
    /// from the visible code, so every declaration is reproduced exactly as found;
    /// restore the type arguments from the original file.
    /// </remarks>
    [Unstable("0.16", "API may change")]
    internal abstract class CommunicationLayer :
        IObserver>>,
        IDisposable
    {
        private static readonly Logger Log = Logger.GetLogger(typeof(CommunicationLayer));

        // Wait (Thread.Sleep / WaitOne argument) between two send attempts.
        private readonly int _timeout;

        // Maximum number of lookup rounds in WaitForTaskRegistration.
        private readonly int _retryRegistration;

        // Retry threshold for Send.
        private readonly int _retrySending;

        // Wait (Thread.Sleep / WaitOne argument) between two registration lookup rounds.
        private readonly int _sleepTime;

        private readonly StreamingNetworkService _networkService;
        protected readonly DefaultTaskToDriverMessageDispatcher _taskToDriverDispatcher;
        private readonly ElasticDriverMessageHandler _driverMessagesHandler;
        private readonly IIdentifierFactory _idFactory;
        private readonly IDisposable _communicationObserver;
        private readonly ConcurrentDictionary _driverMessageObservers;

        protected bool _disposed = false;

        protected readonly ConcurrentDictionary _groupMessageObservers =
            new ConcurrentDictionary();

        /// <summary>
        /// Creates a new communication layer and registers it as an observer
        /// of the network service's remote manager.
        /// </summary>
        /// <param name="timeout">Wait between two attempts to send a message</param>
        /// <param name="retryRegistration">Maximum number of rounds when waiting for task registration</param>
        /// <param name="sleepTime">Wait between two rounds when waiting for task registration</param>
        /// <param name="retrySending">Retry threshold when sending a message</param>
        /// <param name="networkService">The network service used to open connections and look up names</param>
        /// <param name="taskToDriverDispatcher">The dispatcher for task to driver messages</param>
        /// <param name="driverMessagesHandler">The handler exposing the driver message observers</param>
        /// <param name="idFactory">The factory used to create destination identifiers</param>
        protected CommunicationLayer(
            int timeout,
            int retryRegistration,
            int sleepTime,
            int retrySending,
            StreamingNetworkService networkService,
            DefaultTaskToDriverMessageDispatcher taskToDriverDispatcher,
            ElasticDriverMessageHandler driverMessagesHandler,
            IIdentifierFactory idFactory)
        {
            _timeout = timeout;
            _retryRegistration = retryRegistration;
            _sleepTime = sleepTime;
            _retrySending = retrySending;
            _networkService = networkService;
            _taskToDriverDispatcher = taskToDriverDispatcher;
            _driverMessagesHandler = driverMessagesHandler;
            _idFactory = idFactory;
            _driverMessageObservers = _driverMessagesHandler.DriverMessageObservers;

            // Register last: this hands 'this' to the remote manager, so every field
            // must already be assigned when the first callback can arrive.
            _communicationObserver = _networkService.RemoteManager.RegisterObserver(this);
        }

        /// <summary>
        /// Registers an operator topology with the communication layer so that it
        /// receives the group messages addressed to its node id.
        /// </summary>
        /// <param name="operatorObserver">The observer of the communicating topology operator</param>
        /// <exception cref="IllegalStateException">A topology with the same node id is already registered</exception>
        public void RegisterOperatorTopologyForTask(IOperatorTopologyWithCommunication operatorObserver)
        {
            var nodeId = operatorObserver.NodeId();

            if (!_groupMessageObservers.TryAdd(nodeId, operatorObserver))
            {
                throw new IllegalStateException($"Topology for id {nodeId} already added among listeners.");
            }
        }

        /// <summary>
        /// Registers a driver aware operator topology with the communication layer.
        /// </summary>
        /// <param name="operatorObserver">The observer of the driver aware topology</param>
        /// <exception cref="IllegalStateException">A topology with the same node id is already registered</exception>
        internal void RegisterOperatorTopologyForDriver(DriverAwareOperatorTopology operatorObserver)
        {
            var nodeId = operatorObserver.NodeId();

            if (!_driverMessageObservers.TryAdd(nodeId, operatorObserver))
            {
                throw new IllegalStateException($"Topology for id {nodeId} already added among driver listeners.");
            }
        }

        /// <summary>
        /// Send the communication message to the given destination, retrying on failure.
        /// Requests issued after the layer has been disposed are logged and ignored.
        /// </summary>
        /// <param name="destination">The destination node for the message</param>
        /// <param name="message">The message to send</param>
        /// <param name="cancellationSource">The source used to cancel the operation (may be null)</param>
        /// <exception cref="ArgumentNullException">The message is null, or the destination is null or empty</exception>
        /// <exception cref="IllegalStateException">The message could not be sent within the retry threshold</exception>
        /// <exception cref="OperationCanceledException">Cancellation was requested while waiting to retry</exception>
        internal void Send(
            string destination,
            ElasticGroupCommunicationMessage message,
            CancellationTokenSource cancellationSource)
        {
            if (message == null)
            {
                throw new ArgumentNullException(nameof(message));
            }
            if (string.IsNullOrEmpty(destination))
            {
                throw new ArgumentNullException(nameof(destination));
            }
            if (_disposed)
            {
                Log.Log(Level.Warning, "Received send message request after disposing: Ignoring.");
                return;
            }

            IIdentifier destId = _idFactory.Create(destination);

            for (int retry = 0; !Send(destId, message); retry++)
            {
                if (retry > _retrySending)
                {
                    throw new IllegalStateException($"Unable to send message after retrying {retry} times.");
                }

                // Back off before the next attempt, but wake up immediately on cancellation
                // instead of blocking (and retrying) after the operation was cancelled.
                if (cancellationSource == null)
                {
                    Thread.Sleep(_timeout);
                }
                else if (cancellationSource.Token.WaitHandle.WaitOne(_timeout))
                {
                    throw new OperationCanceledException(
                        $"Sending message to {destination} was cancelled.");
                }
            }
        }

        /// <summary>
        /// Forward the received message to the target operator topology.
        /// </summary>
        /// <param name="remoteMessage">The received message</param>
        public abstract void OnNext(IRemoteMessage> remoteMessage);

        /// <summary>
        /// Checks that every identifier is either registered with the name server or
        /// listed among the removed nodes, polling up to the configured number of rounds.
        /// </summary>
        /// <param name="identifiers">The identifiers to look up</param>
        /// <param name="cancellationSource">The source used to cancel the operation (may be null)</param>
        /// <param name="removed">Nodes that got removed during task registration (may be null)</param>
        /// <exception cref="OperationCanceledException">Cancellation was requested</exception>
        /// <exception cref="IllegalStateException">Some identifiers were still missing after the last round</exception>
        public void WaitForTaskRegistration(
            IEnumerable identifiers,
            CancellationTokenSource cancellationSource,
            IDictionary removed = null)
        {
            // Enumerate the input exactly once and drop duplicates: a repeated identifier must
            // not make the "all found" condition unreachable.
            var pending = identifiers.Distinct().ToList();
            var total = pending.Count;

            for (var i = 0; i < _retryRegistration; i++)
            {
                if ((cancellationSource?.Token.IsCancellationRequested) ?? false)
                {
                    Log.Log(Level.Warning, "WaitForTaskRegistration is canceled in retryCount {0}.", i);
                    throw new OperationCanceledException("WaitForTaskRegistration is canceled");
                }

                Log.Log(Level.Info, "In retryCount {0}.", i);

                var round = i;
                pending.RemoveAll(identifier =>
                {
                    if (removed?.ContainsKey(identifier) ?? false)
                    {
                        Log.Log(Level.Verbose,
                            "Dependent id {0} was removed at loop {1}.", identifier, round);
                        return true;
                    }

                    if (Lookup(identifier))
                    {
                        Log.Log(Level.Verbose,
                            "Find a dependent id {0} at loop {1}.", identifier, round);
                        return true;
                    }

                    return false;
                });

                if (pending.Count == 0)
                {
                    Log.Log(Level.Info,
                        "Found all {0} dependent ids at loop {1}.", total, i);
                    return;
                }

                // Cancellable pause: a cancellation request ends the wait early and is
                // reported by the check at the top of the next round.
                if (cancellationSource == null)
                {
                    Thread.Sleep(_sleepTime);
                }
                else
                {
                    cancellationSource.Token.WaitHandle.WaitOne(_sleepTime);
                }
            }

            var msg = string.Join(",", pending);

            Log.Log(Level.Error, "Cannot find registered parent/children: {0}.", msg);
            throw new IllegalStateException("Failed to find parent/children nodes");
        }

        /// <summary>
        /// Look up an identifier with the name server.
        /// </summary>
        /// <param name="identifier">The identifier to look up</param>
        /// <returns>
        /// True if the layer is not disposed and the naming client returns a non-null result
        /// for the identifier; false otherwise.
        /// </returns>
        public bool Lookup(string identifier)
        {
            return !_disposed && _networkService?.NamingClient.Lookup(identifier) != null;
        }

        /// <summary>
        /// Remove the connection to the target destination.
        /// </summary>
        /// <param name="destination">The node to remove the connection</param>
        public void RemoveConnection(string destination)
        {
            IIdentifier destId = _idFactory.Create(destination);
            _networkService.RemoveConnection(destId);
        }

        /// <summary>
        /// Records the error in the log; the error is not propagated.
        /// </summary>
        /// <param name="error">The reported error</param>
        public void OnError(Exception error)
        {
            Log.Log(Level.Warning, "Communication layer received an error: {0}", error?.Message);
        }

        /// <summary>
        /// Notifies every registered group message observer of completion.
        /// </summary>
        public void OnCompleted()
        {
            foreach (var observer in _groupMessageObservers.Values)
            {
                observer.OnCompleted();
            }
        }

        /// <summary>
        /// Dispose the communication layer: completes and clears the group message observers
        /// and unregisters from the remote manager. Subsequent calls do nothing.
        /// </summary>
        public void Dispose()
        {
            if (!_disposed)
            {
                OnCompleted();

                _groupMessageObservers.Clear();

                _communicationObserver.Dispose();

                _disposed = true;

                Log.Log(Level.Info, "Communication layer disposed.");
            }
        }

        /// <summary>
        /// Single attempt at writing the message on a connection to the destination.
        /// </summary>
        /// <param name="destId">The destination identifier</param>
        /// <param name="message">The message to write</param>
        /// <returns>True if the message was written; false if opening or writing failed</returns>
        private bool Send(IIdentifier destId, ElasticGroupCommunicationMessage message)
        {
            var connection = _networkService.NewConnection(destId);

            try
            {
                if (!connection.IsOpen)
                {
                    connection.Open();
                }

                connection.Write(message);
            }
            catch (Exception e)
            {
                // Use placeholders for both values: the failure reason was previously passed as
                // a format argument to a string without placeholders and never reached the log.
                Log.Log(Level.Warning, "Unable to send message to {0}: {1}", destId, e.Message);
                connection.Dispose();
                return false;
            }

            return true;
        }
    }
}
// File: lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/DefaultCommunicationLayer.cs
using System.Collections.Generic;
using Org.Apache.REEF.Network.NetworkService;
using Org.Apache.REEF.Tang.Annotations;
using Org.Apache.REEF.Wake.Remote;
using Org.Apache.REEF.Wake;
using Org.Apache.REEF.Network.Elastic.Config;
using Org.Apache.REEF.Utilities.Logging;
using Org.Apache.REEF.Network.Elastic.Comm.Impl;
using Org.Apache.REEF.Network.Elastic.Topology.Physical;
using Org.Apache.REEF.Utilities.Attributes;
using static Org.Apache.REEF.Network.Elastic.Config.GroupCommunicationConfigurationOptions;

namespace Org.Apache.REEF.Network.Elastic.Task.Impl
{
    /// <summary>
    /// Implementation of the communication layer with default task to driver messages.
    /// </summary>
    /// <remarks>
    /// NOTE(review): generic type arguments appear to have been stripped from this listing
    /// (StreamingNetworkService and the OnNext parameter); declarations are reproduced
    /// exactly as found.
    /// </remarks>
    [Unstable("0.16", "API may change")]
    internal sealed class DefaultCommunicationLayer :
        CommunicationLayer,
        IDefaultTaskToDriverMessages
    {
        private static readonly Logger Log = Logger.GetLogger(typeof(DefaultCommunicationLayer));

        /// <summary>
        /// Creates a new communication layer. Instantiated through Tang injection;
        /// every argument is passed unchanged to the base constructor.
        /// </summary>
        [Inject]
        private DefaultCommunicationLayer(
            [Parameter(typeof(Timeout))] int timeout,
            [Parameter(typeof(RetryCountWaitingForRegistration))] int retryRegistration,
            [Parameter(typeof(SleepTimeWaitingForRegistration))] int sleepTime,
            [Parameter(typeof(ElasticServiceConfigurationOptions.SendRetry))] int retrySending,
            StreamingNetworkService networkService,
            DefaultTaskToDriverMessageDispatcher taskToDriverDispatcher,
            ElasticDriverMessageHandler driverMessagesHandler,
            IIdentifierFactory idFactory)
            : base(
                timeout,
                retryRegistration,
                sleepTime,
                retrySending,
                networkService,
                taskToDriverDispatcher,
                driverMessagesHandler,
                idFactory)
        {
        }

        /// <summary>
        /// Forward the received message to the operator topology registered for the
        /// node id carried by the message. Messages arriving after disposal are logged and ignored.
        /// </summary>
        /// <param name="remoteMessage">The received message</param>
        /// <exception cref="KeyNotFoundException">No operator topology is registered for the node id</exception>
        public override void OnNext(IRemoteMessage> remoteMessage)
        {
            if (_disposed)
            {
                Log.Log(Level.Warning, "Received message after disposing: Ignoring.");
                return;
            }

            var envelope = remoteMessage.Message;
            var target = envelope.Data.NodeId();

            IOperatorTopologyWithCommunication topologyObserver;
            if (_groupMessageObservers.TryGetValue(target, out topologyObserver))
            {
                topologyObserver.OnNext(envelope);
                return;
            }

            throw new KeyNotFoundException($"Unable to find registered operator topology for {target}");
        }

        /// <summary>
        /// Notify the driver that the operator is ready to join the
        /// group communication topology.
        /// </summary>
        /// <param name="taskId">The current task</param>
        /// <param name="stageName">The name of the stage, forwarded to the dispatcher</param>
        /// <param name="operatorId">The identifier of the operator ready to join the topology</param>
        public void JoinTopology(string taskId, string stageName, int operatorId) =>
            _taskToDriverDispatcher.JoinTopology(taskId, stageName, operatorId);

        /// <summary>
        /// Send a notification to the driver for an update on topology state.
        /// </summary>
        /// <param name="taskId">The current task id</param>
        /// <param name="stageName">The name of the stage, forwarded to the dispatcher</param>
        /// <param name="operatorId">The operator requiring the topology update</param>
        public void TopologyUpdateRequest(string taskId, string stageName, int operatorId) =>
            _taskToDriverDispatcher.TopologyUpdateRequest(taskId, stageName, operatorId);

        /// <summary>
        /// Signal the driver that the current stage is completed.
        /// </summary>
        /// <param name="taskId">The current task identifier</param>
        /// <param name="stageName">The name of the completed stage, forwarded to the dispatcher</param>
        public void StageComplete(string taskId, string stageName) =>
            _taskToDriverDispatcher.StageComplete(taskId, stageName);
    }
}
See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+using System;
+using System.Collections.Generic;
+using System.Threading;
+using Org.Apache.REEF.Common.Tasks;
+using Org.Apache.REEF.Network.NetworkService;
+using Org.Apache.REEF.Tang.Annotations;
+using Org.Apache.REEF.Tang.Formats;
+using Org.Apache.REEF.Tang.Interface;
+using Org.Apache.REEF.Wake.Remote.Impl;
+using Org.Apache.REEF.Network.Elastic.Config;
+using Org.Apache.REEF.Network.Elastic.Comm.Impl;
+using Org.Apache.REEF.Common.Tasks.Events;
+using Org.Apache.REEF.Utilities.Attributes;
+using System.Linq;
+
+namespace Org.Apache.REEF.Network.Elastic.Task.Impl
+{
+    /// <summary>
+    /// Default implementation of the task-side context.
+    /// Used by REEF tasks to initialize group communication and fetch stages.
+    /// </summary>
+    [Unstable("0.16", "API may change")]
+    internal sealed class DefaultElasticContext : IElasticContext
+    {
+        private readonly Dictionary _stages = new Dictionary();
+        private readonly string _taskId;
+
+        private readonly INetworkService _networkService;
+
+        private readonly object _disposeLock = new object();
+        private bool _disposed = false;
+
+        /// <summary>
+        /// Creates a new elastic context and registers the task id with the Name Server.
+        /// </summary>
+        /// <param name="stageConfigs">The set of serialized stages configurations</param>
+        /// <param name="taskId">The identifier for this task</param>
+        /// <param name="networkService">The writable network service used to send messages</param>
+        /// <param name="configSerializer">Used to deserialize service configuration</param>
+        /// <param name="taskToDriverDispatcher">Not used in the body; requested so that the
+        /// correct instance propagates through injection</param>
+        /// <param name="driverMessageHandler">Not used in the body</param>
+        /// <param name="injector">Dependency injector</param>
+        [Inject]
+        public DefaultElasticContext(
+            [Parameter(typeof(ElasticServiceConfigurationOptions.SerializedStageConfigs))] ISet stageConfigs,
+            [Parameter(typeof(TaskConfigurationOptions.Identifier))] string taskId,
+            StreamingNetworkService networkService,
+            AvroConfigurationSerializer configSerializer,
+            DefaultTaskToDriverMessageDispatcher taskToDriverDispatcher, // Otherwise the correct instance does not propagate through
+            ElasticDriverMessageHandler driverMessageHandler,
+            IInjector injector)
+        {
+            _networkService = networkService;
+            _taskId = taskId;
+
+            // One forked injector per serialized stage configuration; stages are indexed by name.
+            _stages = stageConfigs
+                .Select(config =>
+                    injector.ForkInjector(configSerializer.FromString(config)).GetInstance())
+                .ToDictionary(stage => stage.StageName, stage => stage);
+
+            _networkService.Register(new StringIdentifier(_taskId));
+        }
+
+        /// <summary>
+        /// This is to ensure all the nodes in the groups are registered before starting communications.
+        /// </summary>
+        /// <param name="cancellationSource">The token used to signal if the operation got cancelled</param>
+        public void WaitForTaskRegistration(CancellationTokenSource cancellationSource = null)
+        {
+            foreach (var stage in _stages.Values)
+            {
+                stage.WaitForTaskRegistration(cancellationSource);
+            }
+        }
+
+        /// <summary>
+        /// Gets the stage object for the given stage name.
+        /// </summary>
+        /// <param name="stageName">The name of the stage</param>
+        /// <returns>The task-side stage object</returns>
+        /// <exception cref="ArgumentException">If no stage is registered under the given name</exception>
+        public IElasticStage GetStage(string stageName)
+        {
+            // The condition was inverted before: a successful lookup threw and a failed
+            // lookup returned the unset out value.
+            if (_stages.TryGetValue(stageName, out IElasticStage stage))
+            {
+                return stage;
+            }
+
+            throw new ArgumentException($"No stage with name: {stageName}.");
+        }
+
+        /// <summary>
+        /// Disposes the services.
+        /// </summary>
+        public void Dispose()
+        {
+            lock (_disposeLock)
+            {
+                if (!_disposed)
+                {
+                    foreach (var sub in _stages.Values)
+                    {
+                        sub.Dispose();
+                    }
+
+                    _networkService.Unregister();
+
+                    _disposed = true;
+                }
+            }
+        }
+
+        /// <summary>
+        /// Action to trigger in case a <see cref="ICloseEvent"/> is received: cancels every stage.
+        /// </summary>
+        /// <param name="value">The close event</param>
+        public void OnNext(ICloseEvent value)
+        {
+            foreach (var stage in _stages.Values)
+            {
+                stage.Cancel();
+            }
+        }
+
+        /// <summary>
+        /// No-op.
+        /// </summary>
+        public void OnError(Exception error)
+        {
+        }
+
+        /// <summary>
+        /// No-op.
+        /// </summary>
+        public void OnCompleted()
+        {
+        }
+    }
+}
\ No newline at end of file
diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/DefaultElasticStage.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/DefaultElasticStage.cs
new file mode 100644
index 0000000000..5cce608067
--- /dev/null
+++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/DefaultElasticStage.cs
@@ -0,0 +1,136 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+using System;
+using System.Threading;
+using Org.Apache.REEF.Tang.Annotations;
+using Org.Apache.REEF.Network.Elastic.Config;
+using System.Collections.Generic;
+using Org.Apache.REEF.Tang.Formats;
+using Org.Apache.REEF.Tang.Interface;
+using Org.Apache.REEF.Tang.Util;
+using Org.Apache.REEF.Network.Elastic.Operators.Physical;
+using Org.Apache.REEF.Utilities.Logging;
+using Org.Apache.REEF.Network.Elastic.Task.Impl;
+using Org.Apache.REEF.Utilities.Attributes;
+
+namespace Org.Apache.REEF.Network.Elastic.Task
+{
+    /// <summary>
+    /// Default implementation of the task-side stage.
+    /// </summary>
+    [Unstable("0.16", "API may change")]
+    internal sealed class DefaultElasticStage : IElasticStage
+    {
+        private static readonly Logger Log = Logger.GetLogger(typeof(DefaultElasticStage));
+
+        private readonly CancellationSource _cancellationSource;
+
+        private readonly object _disposeLock = new object();
+        private bool _disposed = false;
+
+        /// <summary>
+        /// Injectable constructor. Instantiates one operator per serialized operator
+        /// configuration and appends it to the workflow.
+        /// </summary>
+        /// <param name="stageName">The name of the stage</param>
+        /// <param name="operatorConfigs">The serialized configurations of the operators of the stage</param>
+        /// <param name="startIteration">Not used in the body</param>
+        /// <param name="configSerializer">Used to deserialize the operator configurations</param>
+        /// <param name="workflow">The workflow the operators are added to</param>
+        /// <param name="commLayer">Not used in the body</param>
+        /// <param name="cancellationSource">The source used to cancel the stage</param>
+        /// <param name="injector">The injector forked to instantiate each operator</param>
+        [Inject]
+        private DefaultElasticStage(
+            [Parameter(typeof(OperatorParameters.StageName))] string stageName,
+            [Parameter(typeof(OperatorParameters.SerializedOperatorConfigs))] IList operatorConfigs,
+            [Parameter(typeof(OperatorParameters.StartIteration))] int startIteration,
+            AvroConfigurationSerializer configSerializer,
+            Workflow workflow,
+            DefaultCommunicationLayer commLayer,
+            CancellationSource cancellationSource,
+            IInjector injector)
+        {
+            StageName = stageName;
+            Workflow = workflow;
+
+            _cancellationSource = cancellationSource;
+
+            foreach (string operatorConfigStr in operatorConfigs)
+            {
+                IConfiguration operatorConfig = configSerializer.FromString(operatorConfigStr);
+                IInjector operatorInjector = injector.ForkInjector(operatorConfig);
+
+                // The configured message type name closes the generic operator interface,
+                // which is then resolved through the forked injector.
+                string msgType = operatorInjector.GetNamedInstance();
+                Type groupCommOperatorGenericInterface = typeof(IElasticTypedOperator<>);
+                Type groupCommOperatorInterface =
+                    groupCommOperatorGenericInterface.MakeGenericType(Type.GetType(msgType));
+                var operatorObj = operatorInjector.GetInstance(groupCommOperatorInterface);
+
+                Workflow.Add((IElasticOperator)operatorObj);
+            }
+        }
+
+        /// <summary>
+        /// The stage name.
+        /// </summary>
+        public string StageName { get; }
+
+        /// <summary>
+        /// The workflow of the stage.
+        /// </summary>
+        public Workflow Workflow { get; }
+
+        /// <summary>
+        /// Initializes the communication group.
+        /// Computation blocks until all required tasks are registered in the group.
+        /// </summary>
+        /// <param name="cancellationSource">The signal to cancel the operation; when null the
+        /// stage's own cancellation source is used</param>
+        public void WaitForTaskRegistration(CancellationTokenSource cancellationSource = null)
+        {
+            try
+            {
+                Workflow.WaitForTaskRegistration(cancellationSource ?? _cancellationSource.Source);
+            }
+            catch (OperationCanceledException e)
+            {
+                Log.Log(Level.Error, "Stage " + StageName + " failed during registration.", e);
+
+                // Plain rethrow keeps the original stack trace ("throw e;" would reset it).
+                throw;
+            }
+        }
+
+        /// <summary>
+        /// Dispose the stage.
+        /// </summary>
+        public void Dispose()
+        {
+            lock (_disposeLock)
+            {
+                if (!_disposed)
+                {
+                    Workflow?.Dispose();
+
+                    _disposed = true;
+                }
+            }
+        }
+
+        /// <summary>
+        /// Cancel the execution of stage.
+        /// </summary>
+        public void Cancel()
+        {
+            if (!_cancellationSource.IsCancelled)
+            {
+                _cancellationSource.Cancel();
+
+                Log.Log(Level.Info, "Received request to close stage {0}", StageName);
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/DefaultElasticTask.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/DefaultElasticTask.cs
new file mode 100644
index 0000000000..fc05974722
--- /dev/null
+++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/DefaultElasticTask.cs
@@ -0,0 +1,108 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.
You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+using System;
+using Org.Apache.REEF.Common.Tasks;
+using Org.Apache.REEF.Common.Tasks.Events;
+using Org.Apache.REEF.Utilities.Attributes;
+
+namespace Org.Apache.REEF.Network.Elastic.Task.Default
+{
+    /// <summary>
+    /// Base class for tasks that run on top of the elastic group communication service.
+    /// </summary>
+    [Unstable("0.16", "API may change")]
+    public abstract class DefaultElasticTask : ITask, IObserver
+    {
+        private readonly IElasticContext _context;
+        private readonly IElasticStage _stage;
+
+        private readonly CancellationSource _cancellationSource;
+
+        /// <summary>
+        /// Stores the context and the cancellation source, and looks up the stage to execute.
+        /// </summary>
+        /// <param name="source">A cancellation source</param>
+        /// <param name="context">The elastic context</param>
+        /// <param name="stageName">The name of the stage to execute</param>
+        public DefaultElasticTask(
+            CancellationSource source,
+            IElasticContext context,
+            string stageName)
+        {
+            _cancellationSource = source;
+            _context = context;
+            _stage = context.GetStage(stageName);
+        }
+
+        /// <summary>
+        /// Implementation of the Call method of <see cref="ITask"/>: waits for task
+        /// registration, runs <see cref="Execute"/> on the stage workflow and disposes
+        /// the workflow afterwards.
+        /// </summary>
+        /// <param name="memento">The memento passed on to <see cref="Execute"/></param>
+        /// <returns>Always null</returns>
+        public byte[] Call(byte[] memento)
+        {
+            _context.WaitForTaskRegistration(_cancellationSource.Source);
+
+            var stageWorkflow = _stage.Workflow;
+
+            try
+            {
+                Execute(memento, stageWorkflow);
+            }
+            catch (Exception failure)
+            {
+                // Failures of the task logic are handed to the workflow.
+                stageWorkflow.Throw(failure);
+            }
+            finally
+            {
+                stageWorkflow?.Dispose();
+            }
+
+            return null;
+        }
+
+        /// <summary>
+        /// Cancels the cancellation source and disposes the context.
+        /// </summary>
+        public void Dispose()
+        {
+            _cancellationSource.Cancel();
+            _context.Dispose();
+        }
+
+        /// <summary>
+        /// Cancels the stage when a close event is received.
+        /// </summary>
+        /// <param name="value">The close event</param>
+        public void OnNext(ICloseEvent value) => _stage.Cancel();
+
+        /// <summary>
+        /// No-op.
+        /// </summary>
+        public void OnError(Exception error)
+        {
+        }
+
+        /// <summary>
+        /// No-op.
+        /// </summary>
+        public void OnCompleted()
+        {
+        }
+
+        /// <summary>
+        /// Method wrapping the actual task logic.
+        /// Any exception thrown here is caught by <see cref="Call"/> and passed to the workflow.
+        /// </summary>
+        /// <param name="memento">The memento object inherited from the Call method</param>
+        /// <param name="workflow">The workflow object managing the sequence of operation to execute</param>
+        protected abstract void Execute(byte[] memento, Workflow workflow);
+    }
+}
diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/DefaultTaskToDriverMessageDispatcher.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/DefaultTaskToDriverMessageDispatcher.cs
new file mode 100644
index 0000000000..1793ef01a1
--- /dev/null
+++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/DefaultTaskToDriverMessageDispatcher.cs
@@ -0,0 +1,120 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+using Org.Apache.REEF.Tang.Annotations;
+using System;
+using Org.Apache.REEF.Utilities.Logging;
+using Org.Apache.REEF.Network.Elastic.Comm.Enum;
+using Org.Apache.REEF.Utilities;
+using Org.Apache.REEF.Tang.Interface;
+using Org.Apache.REEF.Utilities.Attributes;
+using Org.Apache.REEF.Common.Runtime.Evaluator;
+
+namespace Org.Apache.REEF.Network.Elastic.Task.Impl
+{
+    /// <summary>
+    /// Implementation of <see cref="TaskToDriverMessageDispatcher"/> with the default
+    /// task-to-driver messages.
+    /// </summary>
+    [Unstable("0.16", "API may change")]
+    internal sealed class DefaultTaskToDriverMessageDispatcher :
+        TaskToDriverMessageDispatcher,
+        IDefaultTaskToDriverMessages
+    {
+        private static readonly Logger Log = Logger.GetLogger(typeof(DefaultTaskToDriverMessageDispatcher));
+
+        /// <summary>
+        /// Injectable constructor.
+        /// </summary>
+        /// <param name="heartBeatManager">The heartbeat manager handed to the base dispatcher</param>
+        [Inject]
+        private DefaultTaskToDriverMessageDispatcher(IHeartBeatManager heartBeatManager) : base(heartBeatManager)
+        {
+        }
+
+        /// <summary>
+        /// Notify the driver that operator is ready to join the
+        /// group communication topology.
+        /// </summary>
+        /// <param name="taskId">The current task</param>
+        /// <param name="stageName">The name of the stage</param>
+        /// <param name="operatorId">The identifier of the operator ready to join the topology</param>
+        public void JoinTopology(string taskId, string stageName, int operatorId)
+        {
+            byte[] message = BuildMessage(stageName, TaskMessageType.JoinTopology, operatorId);
+
+            Log.Log(Level.Info, "Operator {0} requesting to join the topology through heartbeat.", operatorId);
+
+            Send(taskId, message);
+        }
+
+        /// <summary>
+        /// Send a notification to the driver for an update on topology state.
+        /// </summary>
+        /// <param name="taskId">The current task id</param>
+        /// <param name="stageName">The name of the stage</param>
+        /// <param name="operatorId">The operator requiring the topology update</param>
+        public void TopologyUpdateRequest(string taskId, string stageName, int operatorId)
+        {
+            byte[] message = BuildMessage(stageName, TaskMessageType.TopologyUpdateRequest, operatorId);
+
+            Log.Log(Level.Info, "Operator {0} requesting a topology update through heartbeat.", operatorId);
+
+            Send(taskId, message);
+        }
+
+        /// <summary>
+        /// Signal the driver that the current stage is completed.
+        /// </summary>
+        /// <param name="taskId">The current task identifier</param>
+        /// <param name="stageName">The name of the stage</param>
+        public void StageComplete(string taskId, string stageName)
+        {
+            byte[] message = BuildMessage(stageName, TaskMessageType.CompleteStage, null);
+
+            Log.Log(Level.Info, "Sending notification that the stage is completed.");
+
+            Send(taskId, message);
+        }
+
+        /// <summary>
+        /// Serializes a task-to-driver message with the layout
+        /// [ushort name length][name bytes][ushort message type][ushort operator id (optional)].
+        /// </summary>
+        /// <param name="stageName">The name of the stage</param>
+        /// <param name="messageType">The type of the message</param>
+        /// <param name="operatorId">The operator id to append, or null to omit it</param>
+        /// <returns>The serialized message</returns>
+        private static byte[] BuildMessage(string stageName, TaskMessageType messageType, int? operatorId)
+        {
+            // Size everything from the encoded bytes, not from the character count:
+            // the two differ whenever the encoder emits more than one byte per character.
+            byte[] stageNameBytes = ByteUtilities.StringToByteArrays(stageName);
+
+            int size = sizeof(ushort) + stageNameBytes.Length + sizeof(ushort);
+            if (operatorId.HasValue)
+            {
+                size += sizeof(ushort);
+            }
+
+            byte[] message = new byte[size];
+            int offset = 0;
+
+            // The length is converted as a ushort, like the other fields, so exactly
+            // sizeof(ushort) bytes are produced instead of the first half of an int.
+            Buffer.BlockCopy(
+                BitConverter.GetBytes((ushort)stageNameBytes.Length), 0, message, offset, sizeof(ushort));
+            offset += sizeof(ushort);
+            Buffer.BlockCopy(stageNameBytes, 0, message, offset, stageNameBytes.Length);
+            offset += stageNameBytes.Length;
+            Buffer.BlockCopy(BitConverter.GetBytes((ushort)messageType), 0, message, offset, sizeof(ushort));
+
+            if (operatorId.HasValue)
+            {
+                offset += sizeof(ushort);
+                Buffer.BlockCopy(
+                    BitConverter.GetBytes((ushort)operatorId.Value), 0, message, offset, sizeof(ushort));
+            }
+
+            return message;
+        }
+    }
+}
\ No newline at end of file
diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/IDefaultTaskToDrivermessages.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/IDefaultTaskToDrivermessages.cs
new file mode 100644
index 0000000000..6c1d67b862
--- /dev/null
+++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/IDefaultTaskToDrivermessages.cs
@@ -0,0 +1,52 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+using Org.Apache.REEF.Utilities.Attributes;
+
+namespace Org.Apache.REEF.Network.Elastic.Task
+{
+    /// <summary>
+    /// Interface defining the messages supported in tasks to driver communications.
+    /// </summary>
+    [Unstable("0.16", "API may change")]
+    internal interface IDefaultTaskToDriverMessages
+    {
+        /// <summary>
+        /// Notify the driver that operator is ready to join the
+        /// group communication topology.
+        /// </summary>
+        /// <param name="taskId">The current task</param>
+        /// <param name="stageName">The name of the stage</param>
+        /// <param name="operatorId">The identifier of the operator ready to join the topology</param>
+        void JoinTopology(string taskId, string stageName, int operatorId);
+
+        /// <summary>
+        /// Send a notification to the driver for an update on topology state.
+        /// </summary>
+        /// <param name="taskId">The current task id</param>
+        /// <param name="stageName">The name of the stage</param>
+        /// <param name="operatorId">The operator requiring the topology update</param>
+        void TopologyUpdateRequest(string taskId, string stageName, int operatorId);
+
+        /// <summary>
+        /// Signal the driver that the current stage is completed.
+        /// </summary>
+        /// <param name="taskId">The current task identifier</param>
+        /// <param name="stageName">The name of the stage</param>
+        void StageComplete(string taskId, string stageName);
+    }
+}
diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/ElasticDriverMessageHandler.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/ElasticDriverMessageHandler.cs
new file mode 100644
index 0000000000..99977b78a9
--- /dev/null
+++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/ElasticDriverMessageHandler.cs
@@ -0,0 +1,72 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.
You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+using Org.Apache.REEF.Common.Tasks;
+using Org.Apache.REEF.Common.Tasks.Events;
+using Org.Apache.REEF.Network.Elastic.Comm.Impl;
+using Org.Apache.REEF.Network.Elastic.Topology.Physical;
+using Org.Apache.REEF.Tang.Annotations;
+using Org.Apache.REEF.Tang.Exceptions;
+using Org.Apache.REEF.Utilities.Attributes;
+using System.Collections.Concurrent;
+using System.Collections.Generic;
+
+namespace Org.Apache.REEF.Network.Elastic.Task.Impl
+{
+    /// <summary>
+    /// Routes the messages coming from the driver to the operator topology
+    /// registered for the node each message is addressed to.
+    /// </summary>
+    [Unstable("0.16", "API may change")]
+    internal sealed class ElasticDriverMessageHandler : IDriverMessageHandler
+    {
+        /// <summary>
+        /// Injectable constructor.
+        /// </summary>
+        [Inject]
+        private ElasticDriverMessageHandler()
+        {
+        }
+
+        /// <summary>
+        /// Observers of incoming messages from the driver, looked up by node identifier.
+        /// </summary>
+        internal readonly ConcurrentDictionary DriverMessageObservers =
+            new ConcurrentDictionary();
+
+        /// <summary>
+        /// Decodes the payload of a driver message and passes it to the matching observer.
+        /// </summary>
+        /// <param name="message">The message from the driver</param>
+        public void Handle(IDriverMessage message)
+        {
+            if (!message.Message.IsPresent())
+            {
+                throw new IllegalStateException("Received message with no payload.");
+            }
+
+            var driverMessage = ElasticDriverMessageImpl.From(message.Message.Value).Message;
+            var targetNode = driverMessage.NodeId();
+
+            if (DriverMessageObservers.TryGetValue(targetNode, out DriverAwareOperatorTopology topology))
+            {
+                topology.OnNext(driverMessage);
+                return;
+            }
+
+            throw new KeyNotFoundException("Unable to find registered operator topology for " + targetNode);
+        }
+    }
+}
\ No newline at end of file
diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/IElasticContext.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/IElasticContext.cs
new file mode 100644
index 0000000000..ac5c908ae4
--- /dev/null
+++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/IElasticContext.cs
@@ -0,0 +1,43 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+using System;
+using Org.Apache.REEF.Tang.Annotations;
+using Org.Apache.REEF.Network.Elastic.Task.Impl;
+using Org.Apache.REEF.Common.Tasks.Events;
+using Org.Apache.REEF.Utilities.Attributes;
+
+namespace Org.Apache.REEF.Network.Elastic.Task
+{
+    /// <summary>
+    /// Used by REEF tasks to initialize group communication and fetch stages.
+    /// The default implementation is <see cref="DefaultElasticContext"/>.
+    /// </summary>
+    [Unstable("0.16", "API may change")]
+    [DefaultImplementation(typeof(DefaultElasticContext))]
+    public interface IElasticContext :
+        IWaitForTaskRegistration,
+        IDisposable,
+        IObserver
+    {
+        /// <summary>
+        /// Gets the stage with the given name.
+        /// </summary>
+        /// <param name="stageName">The name of the stage</param>
+        /// <returns>The task-side configured stage</returns>
+        IElasticStage GetStage(string stageName);
+    }
+}
diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/IElasticStage.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/IElasticStage.cs
new file mode 100644
index 0000000000..74b5fac709
--- /dev/null
+++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/IElasticStage.cs
@@ -0,0 +1,46 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+using Org.Apache.REEF.Tang.Annotations;
+using Org.Apache.REEF.Utilities.Attributes;
+using System;
+
+namespace Org.Apache.REEF.Network.Elastic.Task
+{
+    /// <summary>
+    /// Used by tasks to fetch the workflow of the stages configured in the driver.
+    /// The default implementation is <see cref="DefaultElasticStage"/>.
+    /// </summary>
+    [Unstable("0.16", "API may change")]
+    [DefaultImplementation(typeof(DefaultElasticStage))]
+    public interface IElasticStage : IWaitForTaskRegistration, IDisposable
+    {
+        /// <summary>
+        /// The name of the stage.
+        /// </summary>
+        string StageName { get; }
+
+        /// <summary>
+        /// Cancel the execution of the stage.
+        /// </summary>
+        void Cancel();
+
+        /// <summary>
+        /// The workflow of operators.
+        /// </summary>
+        Workflow Workflow { get; }
+    }
+}
diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/INodeIdentifier.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/INodeIdentifier.cs
new file mode 100644
index 0000000000..79c03a20a3
--- /dev/null
+++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/INodeIdentifier.cs
@@ -0,0 +1,38 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+using Org.Apache.REEF.Utilities.Attributes;
+
+namespace Org.Apache.REEF.Network.Elastic.Task
+{
+    ///
+    /// Interface used by the elastic group communication framework to manage node identifiers.
+    ///
+    [Unstable("0.16", "API may change")]
+    public interface INodeIdentifier
+    {
+        /// <summary>
+        /// The stage name.
+        /// </summary>
+        string StageName { get; }
+
+        /// <summary>
+        /// The operator identifier.
+        /// </summary>
+        int OperatorId { get; }
+    }
+}
diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/IWaitForTaskRegistration.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/IWaitForTaskRegistration.cs
new file mode 100644
index 0000000000..f5d9f8d537
--- /dev/null
+++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/IWaitForTaskRegistration.cs
@@ -0,0 +1,36 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+using Org.Apache.REEF.Utilities.Attributes;
+using System.Threading;
+
+namespace Org.Apache.REEF.Network.Elastic.Task
+{
+    /// <summary>
+    /// Extended by classes requiring the initialization of group communication.
+    /// </summary>
+    [Unstable("0.16", "API may change")]
+    public interface IWaitForTaskRegistration
+    {
+        /// <summary>
+        /// Initializes the communication group.
+        /// Computation blocks until all required tasks are registered in the group.
+        /// </summary>
+        /// <param name="cancellationSource">The signal to cancel the operation</param>
+        void WaitForTaskRegistration(CancellationTokenSource cancellationSource = null);
+    }
+}
diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/NodeIdentifier.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/NodeIdentifier.cs
new file mode 100644
index 0000000000..33ed8f1d42
--- /dev/null
+++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/NodeIdentifier.cs
@@ -0,0 +1,64 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+using Org.Apache.REEF.Utilities.Attributes;
+
+namespace Org.Apache.REEF.Network.Elastic.Task
+{
+    /// <summary>
+    /// An identifier for a given node in the group communication topology.
+    /// A node is uniquely identifiable by a combination of its
+    /// <see cref="StageName"/> and <see cref="OperatorId"/>.
+    /// </summary>
+    [Unstable("0.16", "API may change")]
+    public struct NodeIdentifier : INodeIdentifier, System.IEquatable<NodeIdentifier>
+    {
+        /// <summary>
+        /// The stage name.
+        /// </summary>
+        public string StageName { get; }
+
+        /// <summary>
+        /// The operator identifier.
+        /// </summary>
+        public int OperatorId { get; }
+
+        /// <summary>
+        /// Constructor.
+        /// </summary>
+        /// <param name="stageName">The stage name</param>
+        /// <param name="operatorId">The operator identifier</param>
+        public NodeIdentifier(string stageName, int operatorId)
+        {
+            StageName = stageName;
+            OperatorId = operatorId;
+        }
+
+        /// <summary>
+        /// Field-wise equality (ordinal comparison of the stage name). Implemented
+        /// explicitly so that lookups keyed by this struct do not go through the
+        /// default <see cref="System.ValueType"/> comparison.
+        /// </summary>
+        /// <param name="other">The identifier to compare with</param>
+        /// <returns>True if both stage name and operator id match</returns>
+        public bool Equals(NodeIdentifier other)
+        {
+            return OperatorId == other.OperatorId && string.Equals(StageName, other.StageName);
+        }
+
+        /// <summary>
+        /// Equality against an arbitrary object; true only for an equal <see cref="NodeIdentifier"/>.
+        /// </summary>
+        public override bool Equals(object obj)
+        {
+            return obj is NodeIdentifier other && Equals(other);
+        }
+
+        /// <summary>
+        /// Hash code combining both the stage name and the operator id.
+        /// </summary>
+        public override int GetHashCode()
+        {
+            unchecked
+            {
+                // StageName is null for default(NodeIdentifier).
+                int stageHash = StageName == null ? 0 : StageName.GetHashCode();
+                return (stageHash * 397) ^ OperatorId;
+            }
+        }
+
+        /// <summary>
+        /// Formats the identifier as "StageName-OperatorId".
+        /// </summary>
+        public override string ToString()
+        {
+            return $"{StageName}-{OperatorId}";
+        }
+    }
+
+    /// <summary>
+    /// Extension methods for <see cref="INodeIdentifier"/>.
+    /// </summary>
+    public static class NodeIdentifierExtensions
+    {
+        /// <summary>
+        /// Builds a <see cref="NodeIdentifier"/> from the stage name and operator id of the given object.
+        /// </summary>
+        /// <param name="id">The object exposing stage name and operator id</param>
+        /// <returns>The node identifier</returns>
+        public static NodeIdentifier NodeId(this INodeIdentifier id)
+        {
+            return new NodeIdentifier(id.StageName, id.OperatorId);
+        }
+    }
+}
\ No newline at end of file
diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/TaskToDriverMessageDispatcher.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/TaskToDriverMessageDispatcher.cs
new file mode 100644
index 0000000000..5e46874fa5
--- /dev/null
+++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/TaskToDriverMessageDispatcher.cs
@@ -0,0 +1,73 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+using Org.Apache.REEF.Common.Runtime.Evaluator;
+using Org.Apache.REEF.Common.Protobuf.ReefProtocol;
+using Org.Apache.REEF.Tang.Interface;
+using Org.Apache.REEF.Utilities.Attributes;
+using static Org.Apache.REEF.Common.Protobuf.ReefProtocol.TaskStatusProto;
+
+namespace Org.Apache.REEF.Network.Elastic.Task.Impl
+{
+    ///
+    /// Class used to manage messages going from tasks to the driver.
+ /// Messages are sent through the heartbeat. + /// + [Unstable("0.16", "API may change")] + internal abstract class TaskToDriverMessageDispatcher + { + private readonly IHeartBeatManager _heartBeatManager; + + /// + /// Constructor. + /// + /// Reference to the heartbeat manager + protected TaskToDriverMessageDispatcher(IHeartBeatManager heartBeatManager) + { + _heartBeatManager = heartBeatManager; + } + + /// + /// Send a serialized message to the driver. + /// + /// The id of the task sending the message + /// The serialized message to send + protected void Send(string taskId, byte[] message) + { + TaskStatusProto taskStatusProto = new TaskStatusProto() + { + task_id = taskId, + context_id = Utils.GetContextIdFromTaskId(taskId), + task_message = { new TaskMessageProto { source_id = taskId, message = message } } + }; + + Heartbeat(taskStatusProto); + } + + private void Heartbeat(TaskStatusProto proto) + { + var state = _heartBeatManager.ContextManager.GetTaskStatus(); + + if (state.IsPresent()) + { + proto.state = state.Value.state; + } + + _heartBeatManager.OnNext(proto); + } + } +} diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Workflow.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Workflow.cs new file mode 100644 index 0000000000..b0cf4d5f40 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Workflow.cs @@ -0,0 +1,257 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License.
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +using Org.Apache.REEF.Network.Elastic.Failures; +using Org.Apache.REEF.Network.Elastic.Operators; +using Org.Apache.REEF.Network.Elastic.Operators.Physical; +using Org.Apache.REEF.Tang.Annotations; +using Org.Apache.REEF.Utilities.Attributes; +using Org.Apache.REEF.Utilities.Logging; +using System; +using System.Collections; +using System.Collections.Generic; +using System.Linq; +using System.Threading; +using static Org.Apache.REEF.Network.Elastic.Config.GroupCommunicationConfigurationOptions; + +namespace Org.Apache.REEF.Network.Elastic.Task +{ + /// + /// Task-side representation of the sequence of group communication operations to execute. + /// Exceptions raised during execution are managed by the framework and recovered through + /// the user-defined policies / mechanisms. + /// + [Unstable("0.16", "API may change")] + public sealed class Workflow : IEnumerator, IEnumerable + { + private static readonly Logger Log = Logger.GetLogger(typeof(Workflow)); + + private int _position = -1; + private bool _failed = false; + private bool _disposed = false; + private readonly List _iteratorsPosition = new List(); + + private readonly object disposeLock = new object(); + private readonly IList _operators = new List(); + private readonly CancellationSource _cancellationSource; + private readonly bool _isRescheduled; + + /// + /// Injectable constructor.
+ /// + /// + [Inject] + private Workflow( + [Parameter(typeof(IsRescheduled))] bool isRescheduled, + CancellationSource cancellationSource) + { + _cancellationSource = cancellationSource; + _isRescheduled = isRescheduled; + } + + /// + /// Try to move to the next operation in the workflow. + /// + /// + public bool MoveNext() + { + _position++; + + if (_failed || _cancellationSource.IsCancelled) + { + return false; + } + + // Check if we need to iterate + if (_iteratorsPosition.Count > 0 && _position == _iteratorsPosition[0]) + { + IElasticIterator iteratorOperator = (IElasticIterator)_operators[_position]; + + if (iteratorOperator.MoveNext()) + { + _position++; + ResetOperatorPositions(); + + return true; + } + + if (_iteratorsPosition.Count > 1) + { + _iteratorsPosition.RemoveAt(0); + _position = _iteratorsPosition[0] - 1; + } + + return false; + } + + // In case we have one or zero iterators + // (or we are at the last iterator when multiple iterators exist) + if (_position >= _operators.Count || + (_iteratorsPosition.Count > 1 && _position == _iteratorsPosition[1])) + { + if (_iteratorsPosition.Count == 0) + { + return false; + } + + _position = _iteratorsPosition[0] - 1; + + return MoveNext(); + } + + if (_isRescheduled) + { + Current.OnTaskRescheduled().Invoke(); + } + + return true; + } + + /// + /// Method used to make the framework aware that an exception has been thrown + /// during execution. + /// + /// The raised exception + public void Throw(Exception e) + { + if (_cancellationSource.IsCancelled) + { + Log.Log(Level.Warning, + "Workflow captured an exception while cancellation source was true.", e); + } + else + { + Log.Log(Level.Error, "Workflow captured an exception.", e); + _failed = true; + + throw new OperatorException( + "Workflow captured an exception", Current.OperatorId, e, Current.FailureInfo); + } + } + + /// + /// Start the execution of the workflow from the first operator / iterator.
+ /// + public void Reset() + { + _position = _iteratorsPosition.FirstOrDefault(); // default for int is 0 + } + + /// + /// Get the current elastic operator. + /// + public IElasticOperator Current + { + get + { + return _operators[_position < 0 ? 0 : _position]; + } + } + + object IEnumerator.Current + { + get { return Current; } + } + + /// + /// Dispose the workflow. + /// + public void Dispose() + { + lock (disposeLock) + { + if (!_disposed) + { + // Clean dispose, check that the computation is completed + if (!_failed) + { + foreach (var op in _operators) + { + op?.WaitCompletionBeforeDisposing(); + } + } + + foreach (var op in _operators) + { + op?.Dispose(); + } + } + + _disposed = true; + } + } + + /// + /// Add an elastic operator to the workflow. + /// + /// + internal void Add(IElasticOperator op) + { + op.CancellationSource = _cancellationSource.Source; + + _operators.Add(op); + + if (_iteratorsPosition.Count > 0) + { + var iterPos = _iteratorsPosition.Last(); + var iterator = (IElasticIterator)_operators[iterPos]; + + op.IteratorReference = iterator; + iterator.RegisterActionOnTaskRescheduled(op.OnTaskRescheduled()); + } + + if (op.OperatorType == OperatorType.Iterate) + { + _iteratorsPosition.Add(_operators.Count - 1); + } + } + + /// + /// Initializes the communication group. + /// Computation blocks until all required tasks are registered in the group. + /// + /// The signal to cancel the operation + internal void WaitForTaskRegistration(CancellationTokenSource cancellationSource = null) + { + foreach (var op in _operators) + { + op.WaitForTaskRegistration(cancellationSource); + } + } + + /// + /// Reset the position tracker for all operators in the workflow. 
+ /// + private void ResetOperatorPositions() + { + for (int pos = _position; pos < _operators.Count; pos++) + { + _operators[pos].Reset(); + } + } + + public IEnumerator GetEnumerator() + { + return this; + } + + IEnumerator IEnumerable.GetEnumerator() + { + return this; + } + } +} \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Enum/DataNodeState.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Enum/DataNodeState.cs new file mode 100644 index 0000000000..46c74af970 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Enum/DataNodeState.cs @@ -0,0 +1,31 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +using Org.Apache.REEF.Utilities.Attributes; + +namespace Org.Apache.REEF.Network.Elastic.Topology.Logical.Enum +{ + [Unstable("0.16", "Types may change")] + internal enum DataNodeState : int + { + Reachable = 1, + + Unreachable = 2, + + Lost = 3 + } +} diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Enum/TopologyType.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Enum/TopologyType.cs new file mode 100644 index 0000000000..3563a883a8 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Enum/TopologyType.cs @@ -0,0 +1,35 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +using Org.Apache.REEF.Utilities.Attributes; + +namespace Org.Apache.REEF.Network.Elastic.Topology.Logical.Enum +{ + /// + /// Enum defining the supported type of (logical) topologies + /// in which networked nodes are organized. 
+ /// + [Unstable("0.16", "Types may change")] + public enum TopologyType + { + Flat = 0, + + Tree = 1, + + Ring = 2 + } +} diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/ITopology.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/ITopology.cs new file mode 100644 index 0000000000..ba74942063 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/ITopology.cs @@ -0,0 +1,120 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +using Org.Apache.REEF.Network.Elastic.Comm; +using Org.Apache.REEF.Tang.Interface; +using System.Collections.Generic; +using Org.Apache.REEF.Network.Elastic.Failures; +using System; +using Org.Apache.REEF.Utilities; +using Org.Apache.REEF.Utilities.Attributes; + +namespace Org.Apache.REEF.Network.Elastic.Topology.Logical +{ + /// + /// Represents a topology graph for Elastic Group Communication Operators. + /// + [Unstable("0.16", "API may change")] + public interface ITopology + { + /// + /// The identifier of the operator using the topology. + /// + int OperatorId { get; set; } + + /// + /// The stage of the operator using the topology. + /// + string StageName { get; set; } + + /// + /// Adds a new task to the topology. 
+ /// When called before Build() actually adds the task to the topology. + /// After Build(), it assumes that the task is added because recovered from a failure. + /// A failure machine is given as input so that the topology can update the number of available nodes. + /// + /// The id of the task to be added + /// The failure machine that manages the failures for the operator. + /// True if it is the first time the topology sees this task + bool AddTask(string taskId, IFailureStateMachine failureMachine); + + /// + /// Removes a task from the topology. + /// + /// The id of the task to be removed + /// The number of data points lost because of the removed task + int RemoveTask(string taskId); + + /// + /// Whether the topology can be scheduled. + /// + /// True if the topology is ready to be scheduled + bool CanBeScheduled(); + + /// + /// Finalizes the topology. + /// After the topology has been finalized, any task added to the topology is + /// assumed as a task recovered from a failure. + /// + /// The same finalized topology + ITopology Build(); + + /// + /// Returns the topology configuration for the input task. + /// Must be called only after all tasks have been added to the topology, i.e., after build. + /// + /// The task id of the task that belongs to this Topology + /// The task configuration + IConfiguration GetTaskConfiguration(int taskId); + + /// + /// Utility method for logging the topology state. + /// This will be called every time a topology object is built or modified + /// because of a failure. + /// + string LogTopologyState(); + + /// + /// This method is triggered when a node contacts the driver to synchronize the remote topology + /// with the driver's one.
+ /// + /// The identifier of the task asking for the update + /// An optional failure machine to log updates + IEnumerable TopologyUpdateResponse(string taskId, Optional failureStateMachine); + + /// + /// Action to trigger when the operator receives a notification that a new iteration is started. + /// + /// The new iteration number + void OnNewIteration(int iteration); + + /// + /// Reconfigure the topology in response to some event. + /// + /// The task id responsible for the topology change + /// Some additional topology-specific information + /// The optional iteration number in which the event occurred + /// One or more messages for reconfiguring the tasks + IEnumerable Reconfigure(string taskId,string info = null, int? iteration = null); + + /// + /// Log the final statistics of the operator. + /// This is called when the pipeline execution is completed. + /// + string LogFinalStatistics(); + } +} diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Impl/DataNode.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Impl/DataNode.cs new file mode 100644 index 0000000000..060b8a34e9 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Impl/DataNode.cs @@ -0,0 +1,79 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied.
See the License for the +// specific language governing permissions and limitations +// under the License. + +using Org.Apache.REEF.Network.Elastic.Topology.Logical.Enum; +using Org.Apache.REEF.Utilities.Attributes; +using System.Collections.Generic; + +namespace Org.Apache.REEF.Network.Elastic.Topology.Logical.Impl +{ + /// + /// Represents a node in the operator topology graph. + /// Nodes are logical representations in the Driver for tasks. + /// + [Unstable("0.16", "API may change")] + internal sealed class DataNode + { + private readonly bool _isRoot; + + /// + /// Construct a node using a given task id. + /// + /// The id for the node + /// Whether the node is the root/master of the topology or not + public DataNode( + int taskId, + bool isRoot) + { + TaskId = taskId; + _isRoot = isRoot; + } + + /// + /// The current state for the node. + /// + public DataNodeState FailState { get; set; } = DataNodeState.Reachable; + + /// + /// The parent of the target node. + /// + public DataNode Parent { get; set; } + + /// + /// Add the given nodes to the list of children nodes of the current one. + /// + public void AddChildren(IEnumerable child) + { + Children.AddRange(child); + } + + /// + /// The task id represented by the data node. + /// + public int TaskId { get; } + + /// + /// Return how many children the current node has. + /// + public int NumberOfChildren => Children.Count; + + /// + /// Return the list of children for the current node. + /// + public List Children { get; } = new List(); + } +} diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Impl/EmptyTopology.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Impl/EmptyTopology.cs new file mode 100644 index 0000000000..2547daa0c1 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Impl/EmptyTopology.cs @@ -0,0 +1,167 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements.
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +using Org.Apache.REEF.Tang.Interface; +using Org.Apache.REEF.Tang.Exceptions; +using System.Collections.Generic; +using Org.Apache.REEF.Network.Elastic.Comm; +using Org.Apache.REEF.Network.Elastic.Failures; +using Org.Apache.REEF.Utilities; +using Org.Apache.REEF.Utilities.Attributes; +using Org.Apache.REEF.Tang.Implementations.Tang; + +namespace Org.Apache.REEF.Network.Elastic.Topology.Logical.Impl +{ + /// + /// Topology with no structure. + /// Used as a placeholder when no topology is required. + /// + [Unstable("0.16", "API may change")] + internal class EmptyTopology : ITopology + { + private bool _finalized = false; + + /// + /// The identifier of the operator using the topology. + /// + public int OperatorId { get; set; } = -1; + + /// + /// The stage of the operator using the topology. + /// + public string StageName { get; set; } + + /// + /// Adds a new task to the topology. + /// This method does nothing on the empty topology. + /// + /// The id of the task to be added + /// The failure machine that manage the failure for the operator. + /// This method returns always false + public bool AddTask(string taskId, IFailureStateMachine failureMachine) + { + return false; + } + + /// + /// Removes a task from the topology. 
+ /// This method does nothing on the empty topology. + /// + /// The id of the task to be removed + /// This method always returns 0 + public int RemoveTask(string taskId) + { + return 0; + } + + /// + /// Whether the topology can be scheduled. + /// + /// This method always returns true + public bool CanBeScheduled() + { + return true; + } + + /// + /// Finalizes the topology. + /// + /// The same finalized topology + public ITopology Build() + { + if (_finalized) + { + throw new IllegalStateException("Topology cannot be built more than once"); + } + + if (OperatorId <= 0) + { + throw new IllegalStateException("Topology cannot be built because not linked to any operator"); + } + + if (StageName == null) + { + throw new IllegalStateException("Topology cannot be built because not linked to any stage"); + } + + _finalized = true; + + return this; + } + + /// + /// Returns the topology configuration for the input task. + /// On the empty topology this is an empty configuration. + /// + /// The task id of the task that belongs to this Topology + /// The task configuration + public IConfiguration GetTaskConfiguration(int taskId) + { + return TangFactory.GetTang().NewConfigurationBuilder().Build(); + } + + /// + /// Utility method for logging the topology state. + /// This will be called every time a topology object is built or modified + /// because of a failure. + /// + public string LogTopologyState() + { + return "empty"; + } + + /// + /// This method is triggered when a node detects a change in the topology and asks the driver for an update. + /// + /// The identifier of the task asking for the update + /// An optional failure machine to log updates + public IEnumerable TopologyUpdateResponse(string taskId, Optional failureStateMachine) + { + return new IElasticDriverMessage[] { }; + } + + /// + /// Action to trigger when the operator receives a notification that a new iteration is started. + /// This method does nothing.
+ /// + /// The new iteration number + public void OnNewIteration(int iteration) + { + } + + /// + /// Reconfigure the topology in response to some event. + /// + /// The task id responsible for the topology change + /// Some additional topology-specific information + /// The optional iteration number in which the event occurred + /// An empty list of messages + public IEnumerable Reconfigure(string taskId, string info = null, int? iteration = null) + { + return new IElasticDriverMessage[] { }; + } + + /// + /// Log the final statistics of the operator. + /// This is called when the pipeline execution is completed. + /// + public string LogFinalStatistics() + { + return string.Empty; + } + } +} diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Impl/FlatTopology.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Impl/FlatTopology.cs new file mode 100644 index 0000000000..8f0bd0decb --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Impl/FlatTopology.cs @@ -0,0 +1,407 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +using System; +using Org.Apache.REEF.Tang.Interface; +using System.Collections.Generic; +using Org.Apache.REEF.Tang.Exceptions; +using Org.Apache.REEF.Utilities.Logging; +using System.Linq; +using Org.Apache.REEF.Network.Elastic.Comm; +using Org.Apache.REEF.Network.Elastic.Failures; +using Org.Apache.REEF.Utilities; +using Org.Apache.REEF.Network.Elastic.Comm.Impl; +using Org.Apache.REEF.Utilities.Attributes; +using Org.Apache.REEF.Network.Elastic.Topology.Logical.Enum; +using Org.Apache.REEF.Tang.Implementations.Tang; + +namespace Org.Apache.REEF.Network.Elastic.Topology.Logical.Impl +{ + /// + /// Topology class for N nodes organized as a shallow tree with 1 root (the master) and N-1 nodes + /// connected to it. + /// + [Unstable("0.16", "API may change")] + public class FlatTopology : ITopology + { + private static readonly Logger Log = Logger.GetLogger(typeof(FlatTopology)); + + private string _rootTaskId = null; + private readonly int _rootId; + private string _taskStage = null; + private volatile int _iteration = 1; + private bool _finalized = false; + + private readonly IDictionary _nodes; + private DataNode _root; // This is just for caching + private readonly HashSet _lostNodesToBeRemoved = new HashSet(); + private HashSet _nodesWaitingToJoinTopologyNextIteration = new HashSet(); + private HashSet _nodesWaitingToJoinTopology = new HashSet(); + + private volatile int _availableDataPoints = 0; + private int _totalDataPoints = 0; + + private readonly object _lock = new object(); + + /// + /// Constructor for flat topology. After construction the graph is empty + /// and tasks need to be added. 
+ /// + /// The id of the task that will be set as root of the topology + /// Whether the leaf nodes need to be ordered or not + public FlatTopology(int rootId, bool sorted = false) + { + _rootId = rootId; + OperatorId = -1; + + if (sorted) + { + _nodes = new SortedDictionary(); + } + else + { + _nodes = new Dictionary(); + } + } + + /// + /// The identifier of the operator using the topology. + /// + public int OperatorId { get; set; } + + /// + /// The stage of the operator using the topology. + /// + public string StageName { get; set; } + + /// + /// Adds a new task to the topology. + /// When called before Build() actually adds the task to the topology. + /// After Build(), it assumes that the task is added because recovered from a failure. + /// A failure machine is given as input so that the topology can update the number of available nodes. + /// + /// The id of the task to be added + /// The failure machine that manages the failures for the operator. + /// True if it is the first time the topology sees this task + public bool AddTask(string taskId, IFailureStateMachine failureMachine) + { + if (string.IsNullOrEmpty(taskId)) + { + throw new ArgumentNullException(nameof(taskId)); + } + + if (failureMachine == null) + { + throw new ArgumentNullException(nameof(failureMachine)); + } + + var id = Utils.GetTaskNum(taskId); + + lock (_lock) + { + if (_nodes.TryGetValue(id, out DataNode node)) + { + if (node.FailState != DataNodeState.Reachable) + { + // This node was already added to the topology and probably failed. + _nodesWaitingToJoinTopologyNextIteration.Add(taskId); + node.FailState = DataNodeState.Unreachable; + return false; + } + + throw new ArgumentException("Task already added to the topology."); + } + + DataNode dnode = new DataNode(id, false); + _nodes[id] = dnode; + + if (_finalized) + { + // New node but elastically added. It should be gracefully added to the topology.
+ _nodesWaitingToJoinTopologyNextIteration.Add(taskId); + dnode.FailState = DataNodeState.Unreachable; + _root.Children.Add(dnode); + failureMachine.AddDataPoints(1, true); + failureMachine.RemoveDataPoints(1); + return false; + } + + // This is required later in order to build the topology (the stage starts out unset, i.e. null) + if (string.IsNullOrEmpty(_taskStage)) + { + _taskStage = Utils.GetTaskStages(taskId); + } + } + + _availableDataPoints++; + failureMachine.AddDataPoints(1, true); + + return true; + } + + /// + /// Removes a task from the topology. + /// + /// The id of the task to be removed + /// The number of data points lost because of the removed task + public int RemoveTask(string taskId) + { + if (string.IsNullOrEmpty(taskId)) + { + throw new ArgumentNullException(nameof(taskId)); + } + + if (taskId == _rootTaskId) + { + throw new NotImplementedException("Failure on master not supported yet"); + } + + var id = Utils.GetTaskNum(taskId); + + lock (_lock) + { + if (!_nodes.TryGetValue(id, out DataNode node)) + { + throw new ArgumentException("Task is not part of this topology"); + } + + var prevState = node.FailState; + node.FailState = DataNodeState.Lost; + _nodesWaitingToJoinTopologyNextIteration.Remove(taskId); + _nodesWaitingToJoinTopology.Remove(taskId); + _lostNodesToBeRemoved.Add(taskId); + + if (prevState != DataNodeState.Reachable) + { + return 0; + } + + _availableDataPoints--; + } + + return 1; + } + + /// + /// Whether the topology can be scheduled, i.e., whether the root task has been added. + /// + /// True if the topology is ready to be scheduled + public bool CanBeScheduled() + { + lock (_lock) { return _nodes.ContainsKey(_rootId); } + } + + /// + /// Finalizes the topology. + /// After the topology has been finalized, any task added to the topology is + /// assumed as a task recovered from a failure.
+ /// + /// The same finalized topology + public ITopology Build() + { + if (_finalized) + { + throw new IllegalStateException("Topology cannot be built more than once"); + } + + if (!_nodes.ContainsKey(_rootId)) + { + throw new IllegalStateException("Topology cannot be built because the root node is missing"); + } + + if (OperatorId <= 0) + { + throw new IllegalStateException("Topology cannot be built because not linked to any operator"); + } + + if (string.IsNullOrEmpty(StageName)) + { + throw new IllegalStateException("Topology cannot be built because not linked to any stage"); + } + + BuildTopology(); + + _rootTaskId = Utils.BuildTaskId(_taskStage, _rootId); + _finalized = true; + + return this; + } + + /// + /// Utility method for logging the topology state. + /// This will be called every time a topology object is built or modified + /// because of a failure. + /// + public string LogTopologyState() + { + return _rootId + "\n" + string.Join(" ", _root.Children.Select(node => + node.FailState == DataNodeState.Reachable ? "" + node.TaskId : "X")); + } + + /// + /// Returns the topology configuration for the input task. + /// Must be called only after all tasks have been added to the topology, i.e., after build. + /// + /// The task id of the task that belongs to this Topology + /// The task configuration + public IConfiguration GetTaskConfiguration(int taskId) + { + if (!_finalized) + { + throw new IllegalStateException("Cannot get task configuration from a not finalized topology."); + } + + var confBuilder = TangFactory.GetTang().NewConfigurationBuilder(); + + if (taskId == _rootId) + { + foreach (var tId in _root.Children) + { + confBuilder.BindSetEntry("" + tId.TaskId); + } + } + + return confBuilder + .BindNamedParam("" + _rootId) + .Build(); + } + + /// + /// This method is triggered when a node contacts the driver to synchronize the remote topology + /// with the driver's one.
+ /// + /// The identifier of the task asking for the update + /// An optional failure machine to log updates + public IEnumerable TopologyUpdateResponse( + string taskId, + Optional failureStateMachine) + { + if (taskId != _rootTaskId) + { + throw new IllegalStateException("Only root tasks are supposed to request topology updates."); + } + + if (!failureStateMachine.IsPresent()) + { + throw new IllegalStateException("Cannot update topology without failure machine."); + } + + lock (_lock) + { + var update = new TopologyUpdate(_rootTaskId, _nodesWaitingToJoinTopology); + var data = new UpdateMessagePayload( new[]{ update }, StageName, OperatorId, _iteration); + var returnMessage = new ElasticDriverMessageImpl(_rootTaskId, data); + + if (_nodesWaitingToJoinTopology.Count > 0) + { + if (Log.IsLoggable(Level.Info)) + { + Log.Log(Level.Info, + "Tasks [{0}] are added to topology in iteration {1}", + string.Join(",", _nodesWaitingToJoinTopology), + _iteration); + } + + _availableDataPoints += _nodesWaitingToJoinTopology.Count; + failureStateMachine.Value.AddDataPoints(_nodesWaitingToJoinTopology.Count, false); + + foreach (var node in _nodesWaitingToJoinTopology) + { + var id = Utils.GetTaskNum(node); + _nodes[id].FailState = DataNodeState.Reachable; + } + + _nodesWaitingToJoinTopology.Clear(); + } + + return new[] { returnMessage }; + } + } + + /// + /// Action to trigger when the operator receives a notification that a new iteration is started. 
        ///
        /// <param name="iteration">The new iteration number</param>
        public void OnNewIteration(int iteration)
        {
            // Report the size of the topology for the iteration that has just ended.
            Log.Log(Level.Info,
                "Flat Topology for Operator {0} in Iteration {1} is closed with {2} nodes",
                OperatorId,
                iteration - 1,
                _availableDataPoints);
            // NOTE(review): _iteration and _totalDataPoints are updated outside the lock, while
            // TopologyUpdateResponse reads _iteration and updates _availableDataPoints inside it.
            _iteration = iteration;
            _totalDataPoints += _availableDataPoints;

            lock (_lock)
            {
                // Nodes queued for the next iteration become the nodes waiting to join now.
                _nodesWaitingToJoinTopology = _nodesWaitingToJoinTopologyNextIteration;
                _nodesWaitingToJoinTopologyNextIteration = new HashSet();
            }
        }

        /// <summary>
        /// Reconfigure the topology in response to some event.
        /// </summary>
        /// <param name="taskId">The task id responsible for the topology change</param>
        /// <param name="info">Some additional topology-specific information (not used by this implementation)</param>
        /// <param name="iteration">The optional iteration number in which the event occurred
        /// (not used by this implementation)</param>
        /// <returns>One or more messages for reconfiguring the Tasks</returns>
        public IEnumerable Reconfigure(
            string taskId,
            string info = null,
            int? iteration = null)
        {
            if (taskId == _rootTaskId)
            {
                throw new NotImplementedException("Failure on master not supported yet.");
            }

            List messages = new List();

            lock (_lock)
            {
                // NOTE(review): as in TopologyUpdateResponse, the collection handed to TopologyUpdate
                // is cleared right after; safe only if TopologyUpdate copies it - confirm.
                var update = new TopologyUpdate(_rootTaskId, _lostNodesToBeRemoved);
                var data = new FailureMessagePayload(new[] { update }, StageName, OperatorId, -1);
                var returnMessage = new ElasticDriverMessageImpl(_rootTaskId, data);

                Log.Log(Level.Info, "Task {0} is removed from topology", taskId);
                messages.Add(returnMessage);
                _lostNodesToBeRemoved.Clear();
            }

            return messages;
        }

        /// <summary>
        /// Log the final statistics of the operator.
        /// This is called when the pipeline execution is completed.
        /// </summary>
        /// <returns>
        /// A line reporting, for this operator, the total data points divided by the number of
        /// completed iterations when at least two iterations were started, and the currently
        /// available data points otherwise
        /// </returns>
        public string LogFinalStatistics()
        {
            return string.Format(
                "\nAverage number of nodes in the topology of Operator {0}: {1}",
                OperatorId,
                _iteration >= 2 ? (float)_totalDataPoints / (_iteration - 1) : _availableDataPoints);
        }

        /// <summary>
        /// Selects the root node and attaches every other known node to it as a child.
        /// </summary>
        private void BuildTopology()
        {
            _root = _nodes[_rootId];
            _root.AddChildren(_nodes.Values.Where(n => n.TaskId != _rootId));
        }
    }
}
\ No newline at end of file
diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/DefaultBroadcastTopology.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/DefaultBroadcastTopology.cs
new file mode 100644
index 0000000000..f8130a1aac
--- /dev/null
+++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/DefaultBroadcastTopology.cs
@@ -0,0 +1,148 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
using Org.Apache.REEF.Network.Elastic.Config;
using Org.Apache.REEF.Network.Elastic.Task.Impl;
using Org.Apache.REEF.Tang.Annotations;
using System.Collections.Generic;
using Org.Apache.REEF.Common.Tasks;
using System.Threading;
using Org.Apache.REEF.Network.Elastic.Comm.Impl;
using Org.Apache.REEF.Utilities.Logging;
using System.Linq;
using Org.Apache.REEF.Utilities.Attributes;
using Org.Apache.REEF.Network.Elastic.Task;
using Org.Apache.REEF.Network.Elastic.Failures;
using System;

namespace Org.Apache.REEF.Network.Elastic.Topology.Physical.Default
{
    /// <summary>
    /// Topology class managing data communication for broadcast operators.
    /// </summary>
    /// <remarks>
    /// NOTE(review): generic type arguments appear to have been stripped from this extract
    /// (e.g. the type parameter T used by GetDataMessage); they are reproduced as found.
    /// </remarks>
    [Unstable("0.16", "API may change")]
    internal sealed class DefaultBroadcastTopology : OneToNTopology
    {
        /// <summary>
        /// Injectable constructor. Forwards all the parameters to the base class after turning the
        /// numeric root id into a task identifier for the given stage.
        /// </summary>
        [Inject]
        private DefaultBroadcastTopology(
            [Parameter(typeof(OperatorParameters.StageName))] string stageName,
            [Parameter(typeof(OperatorParameters.TopologyRootTaskId))] int rootId,
            [Parameter(typeof(OperatorParameters.TopologyChildTaskIds))] ISet children,
            [Parameter(typeof(OperatorParameters.PiggybackTopologyUpdates))] bool piggyback,
            [Parameter(typeof(OperatorParameters.OperatorId))] int operatorId,
            [Parameter(typeof(TaskConfigurationOptions.Identifier))] string taskId,
            [Parameter(typeof(GroupCommunicationConfigurationOptions.Retry))] int retry,
            [Parameter(typeof(GroupCommunicationConfigurationOptions.Timeout))] int timeout,
            [Parameter(typeof(GroupCommunicationConfigurationOptions.DisposeTimeout))] int disposeTimeout,
            DefaultCommunicationLayer commLayer) : base(
                stageName,
                taskId,
                Utils.BuildTaskId(stageName, rootId),
                operatorId,
                children,
                piggyback,
                retry,
                timeout,
                disposeTimeout,
                commLayer)
        {
        }

        /// <summary>
        /// Creates a DataMessage out of some input data.
        /// </summary>
        /// <param name="iteration">The iteration number of this message</param>
        /// <param name="data">The data to communicate</param>
        /// <returns>
        /// A properly configured DataMessage; a DataMessageWithTopology when topology updates
        /// are piggybacked on data messages
        /// </returns>
        public override DataMessage GetDataMessage(int iteration, T data)
        {
            if (_piggybackTopologyUpdates)
            {
                return new DataMessageWithTopology(StageName, OperatorId, iteration, data);
            }

            return new DataMessage(StageName, OperatorId, iteration, data);
        }

        /// <summary>
        /// Not supported: broadcast sends a single piece of data at a time.
        /// </summary>
        /// <param name="iteration">The iteration number of this message</param>
        /// <param name="data">The data to communicate</param>
        /// <returns>Never returns</returns>
        /// <exception cref="NotImplementedException">Always thrown</exception>
        public override DataMessage GetDataMessage(int iteration, params T[] data)
        {
            throw new NotImplementedException("Broadcast is allowed to send only one piece of data at a time");
        }

        /// <summary>
        /// Send a previously queued data message.
        /// </summary>
        /// <param name="cancellationSource">The source in case the task is cancelled</param>
        /// <exception cref="OperatorException">
        /// Thrown when no topology update is received after the configured number of retries
        /// </exception>
        protected override void Send(CancellationTokenSource cancellationSource)
        {
            int retry = 0;

            // Check if we have a message to send
            if (_sendQueue.TryPeek(out ElasticGroupCommunicationMessage message))
            {
                // Broadcast topologies require the driver to send topology updates to the root node
                // in order to have the most up-to-date topology at each broadcast round.
                while (!_topologyUpdateReceived.WaitOne(_timeout))
                {
                    // If we are here, we weren't able to receive a topology update on time. Retry.
                    if (cancellationSource.IsCancellationRequested)
                    {
                        Log.Log(Level.Warning, "Received cancellation request: stop sending");
                        return;
                    }

                    retry++;

                    if (retry > _retry)
                    {
                        // The wait that timed out is the one for the driver's topology update,
                        // so report that rather than a failed send.
                        throw new OperatorException(
                            $"Iteration {((DataMessage)message).Iteration}: " +
                            $"Failed to receive a topology update from the driver after {_retry} retries: " +
                            "broadcast message not sent.",
                            OperatorId);
                    }

                    // Ask the driver again for the topology update.
                    TopologyUpdateRequest();
                }

                // Get the actual message to send. Note that although message sending is asynchronous,
                // broadcast rounds should not overlap.
                var canSend = _sendQueue.TryDequeue(out message);

                if (TaskId == RootTaskId)
                {
                    // Prepare the event to block for the next round of topology updates.
                    _topologyUpdateReceived.Reset();
                }

                if (canSend)
                {
                    // Deliver the message to the communication layer, skipping nodes marked for removal.
                    foreach (var destination in _children.Values.Except(_nodesToRemove.Keys))
                    {
                        _commLayer.Send(destination, message, cancellationSource);
                    }
                }
            }
        }
    }
}
diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/OneToNTopology.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/OneToNTopology.cs
new file mode 100644
index 0000000000..a878925626
--- /dev/null
+++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/OneToNTopology.cs
@@ -0,0 +1,275 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
using Org.Apache.REEF.Network.Elastic.Task.Impl;
using System.Collections.Generic;
using System;
using Org.Apache.REEF.Network.Elastic.Comm;
using Org.Apache.REEF.Tang.Exceptions;
using System.Threading;
using Org.Apache.REEF.Network.Elastic.Comm.Impl;
using Org.Apache.REEF.Utilities.Logging;
using Org.Apache.REEF.Network.NetworkService;
using System.Collections.Concurrent;
using System.Linq;
using Org.Apache.REEF.Utilities.Attributes;

namespace Org.Apache.REEF.Network.Elastic.Topology.Physical.Default
{
    /// <summary>
    /// Base class for topologies following a one to N communication pattern.
    /// </summary>
    /// <remarks>
    /// NOTE(review): generic type arguments appear to have been stripped from this extract
    /// (collections, NsMessage, List); they are reproduced as found.
    /// </remarks>
    [Unstable("0.16", "API may change")]
    internal abstract class OneToNTopology : OperatorTopologyWithDefaultCommunication
    {
        protected static readonly Logger Log = Logger.GetLogger(typeof(OneToNTopology));

        // Task ids reported as failed by the driver and not yet purged from _children.
        protected readonly ConcurrentDictionary _nodesToRemove =
            new ConcurrentDictionary();

        // Signaled when the driver has delivered the topology update for the current round.
        protected readonly ManualResetEvent _topologyUpdateReceived;
        protected readonly bool _piggybackTopologyUpdates;

        /// <summary>
        /// Construct a one to N topology.
        /// </summary>
        /// <param name="stageName">The stage name the topology is working on</param>
        /// <param name="taskId">The identifier of the task the topology is running on</param>
        /// <param name="rootTaskId">The identifier of the root node in the topology</param>
        /// <param name="operatorId">The identifier of the operator for this topology</param>
        /// <param name="children">The list of nodes this task has to send messages to</param>
        /// <param name="piggyback">Whether to piggyback topology update messages to data message</param>
        /// <param name="retry">How many times the topology will retry to send a message</param>
        /// <param name="timeout">After how long the topology waits for an event</param>
        /// <param name="disposeTimeout">Maximum wait time for topology disposal</param>
        /// <param name="commLayer">Layer responsible for communication</param>
        protected OneToNTopology(
            string stageName,
            string taskId,
            string rootTaskId,
            int operatorId,
            IEnumerable children,
            bool piggyback,
            int retry,
            int timeout,
            int disposeTimeout,
            DefaultCommunicationLayer commLayer) : base(
                stageName,
                taskId,
                rootTaskId,
                operatorId,
                commLayer,
                retry,
                timeout,
                disposeTimeout)
        {
            // The event starts signaled on non-root tasks, so only the root waits for updates.
            _topologyUpdateReceived = new ManualResetEvent(RootTaskId != taskId);

            _commLayer.RegisterOperatorTopologyForTask(this);
            _commLayer.RegisterOperatorTopologyForDriver(this);

            _piggybackTopologyUpdates = piggyback;

            foreach (var child in children)
            {
                var childTaskId = Utils.BuildTaskId(StageName, child);

                _children.TryAdd(child, childTaskId);
            }
        }

        /// <summary>
        /// Whether the topology is still sending messages or not.
        /// </summary>
        public bool IsSending
        {
            get { return !_sendQueue.IsEmpty; }
        }

        /// <summary>
        /// Waiting logic before disposing topologies. On the root task, polls the communication
        /// layer every 100 ms for each child until the lookup fails or cancellation is requested.
        /// </summary>
        /// <param name="cancellationSource">The signal to stop waiting</param>
        public void WaitCompletionBeforeDisposing(CancellationTokenSource cancellationSource)
        {
            if (TaskId == RootTaskId)
            {
                foreach (var node in _children.Values)
                {
                    while (_commLayer.Lookup(node) && !cancellationSource.IsCancellationRequested)
                    {
                        Thread.Sleep(100);
                    }
                }
            }
        }

        /// <summary>
        /// Creates a DataMessage out of some input data.
        /// </summary>
        /// <param name="iteration">The iteration number of this message</param>
        /// <param name="data">The data to communicate</param>
        /// <returns>A properly configured DataMessage</returns>
        public abstract DataMessage GetDataMessage(int iteration, T data);

        /// <summary>
        /// Creates a DataMessage out of some input data.
        /// </summary>
        /// <param name="iteration">The iteration number of this message</param>
        /// <param name="data">The data to communicate</param>
        /// <returns>A properly configured DataMessage</returns>
        public abstract DataMessage GetDataMessage(int iteration, params T[] data);

        /// <summary>
        /// Initializes the communication group.
        /// Computation blocks until all required tasks are registered in the group.
        /// </summary>
        /// <param name="cancellationSource">The signal to cancel the operation</param>
        /// <exception cref="IllegalStateException">Thrown when the registration wait fails</exception>
        public override void WaitForTaskRegistration(CancellationTokenSource cancellationSource)
        {
            try
            {
                _commLayer.WaitForTaskRegistration(_children.Values, cancellationSource, _nodesToRemove);
            }
            catch (Exception e)
            {
                throw new IllegalStateException(
                    "Failed to find parent/children nodes in operator topology for node: " + TaskId, e);
            }

            _initialized = true;

            // Flush whatever was queued while the topology was being set up.
            Send(cancellationSource);
        }

        /// <summary>
        /// Handler for incoming messages from other topology nodes.
        /// </summary>
        /// <param name="message">The message that needs to be delivered to the operator</param>
        public override void OnNext(NsMessage message)
        {
            if (_messageQueue.IsAddingCompleted)
            {
                throw new IllegalStateException("Trying to add messages to a closed non-empty queue.");
            }

            _messageQueue.Add(message.Data);

            if (_piggybackTopologyUpdates)
            {
                // Direct cast: a message of the wrong type fails with a descriptive
                // InvalidCastException instead of a NullReferenceException.
                var topologyPayload = (DataMessageWithTopology)message.Data;
                var updates = topologyPayload.TopologyUpdates;

                // Apply the update addressed to this task and strip it before forwarding.
                UpdateTopology(ref updates);
                topologyPayload.TopologyUpdates = updates;
            }

            // Forward the received message to the children, if any.
            if (!_children.IsEmpty)
            {
                _sendQueue.Enqueue(message.Data);
            }

            if (_initialized)
            {
                Send(_cancellationSignal);
            }
        }

        /// <summary>
        /// Handler for messages coming from the driver.
        /// </summary>
        /// <param name="message">Message from the driver</param>
        /// <exception cref="ArgumentException">Thrown for payload types other than Failure and Update</exception>
        public override void OnNext(DriverMessagePayload message)
        {
            switch (message.PayloadType)
            {
                case DriverMessagePayloadType.Failure:
                    {
                        // Direct cast: fail with InvalidCastException rather than NullReferenceException.
                        var rmsg = (TopologyMessagePayload)message;

                        foreach (var updates in rmsg.TopologyUpdates)
                        {
                            foreach (var node in updates.Children)
                            {
                                Log.Log(Level.Info, "Removing task {0} from the topology.", node);
                                _nodesToRemove.TryAdd(node, 0);
                                _commLayer.RemoveConnection(node);
                            }
                        }
                        break;
                    }
                case DriverMessagePayloadType.Update:
                    {
                        if (_sendQueue.Count > 0)
                        {
                            if (_sendQueue.TryPeek(out ElasticGroupCommunicationMessage toSendmsg))
                            {
                                if (_piggybackTopologyUpdates)
                                {
                                    // Both casts happen before any state is mutated, so a message of
                                    // the wrong type leaves the topology untouched.
                                    var toSendmsgWithTop = (DataMessageWithTopology)toSendmsg;
                                    var updates = ((TopologyMessagePayload)message).TopologyUpdates;

                                    UpdateTopology(ref updates);
                                    toSendmsgWithTop.TopologyUpdates = updates;
                                }

                                // Purge the nodes reported as failed from the children.
                                foreach (var taskId in _nodesToRemove.Keys)
                                {
                                    _children.TryRemove(Utils.GetTaskNum(taskId), out string str);
                                }
                                _nodesToRemove.Clear();
                            }

                            // Unblock this broadcast round.
                            _topologyUpdateReceived.Set();
                        }
                        else
                        {
                            Log.Log(Level.Warning, "Received a topology update message from driver "
                                + "but sending queue is empty: ignoring.");
                        }
                    }
                    break;

                default:
                    throw new ArgumentException(
                        $"Message type {message.PayloadType} not supported by one to N topologies.");
            }
        }

        /// <summary>
        /// Applies the update addressed to this task, if any, and removes it from the list.
        /// A child listed in the update is added to the children unless it was pending removal,
        /// in which case it is only dropped from the pending-removal set.
        /// </summary>
        /// <param name="updates">The topology updates to scan; the matching entry is removed</param>
        private void UpdateTopology(ref List updates)
        {
            var update = updates.Find(elem => elem.Node == TaskId);

            if (update != null)
            {
                foreach (var child in update.Children)
                {
                    if (!_nodesToRemove.TryRemove(child, out byte value))
                    {
                        var id = Utils.GetTaskNum(child);
                        _children.TryAdd(id, child);
                    }
                }

                updates.Remove(update);
            }
        }
    }
}
\ No newline at end of file
diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/OperatorTopologyWithDefaultCommunication.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/OperatorTopologyWithDefaultCommunication.cs
new file mode 100644
index 0000000000..78115f71c6
--- /dev/null
+++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/OperatorTopologyWithDefaultCommunication.cs
@@ -0,0 +1,256 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
using Org.Apache.REEF.Network.Elastic.Task.Impl;
using System;
using System.Collections.Concurrent;
using System.Threading;
using Org.Apache.REEF.Network.NetworkService;
using Org.Apache.REEF.Tang.Exceptions;
using Org.Apache.REEF.Network.Elastic.Comm.Impl;
using Org.Apache.REEF.Utilities.Attributes;
using System.Collections.Generic;

namespace Org.Apache.REEF.Network.Elastic.Topology.Physical.Default
{
    /// <summary>
    /// Base class for topologies where nodes are allowed to communicate between themselves
    /// and to the driver.
    /// </summary>
    /// <remarks>
    /// NOTE(review): generic type arguments appear to have been stripped from this extract
    /// (queues, dictionary, NsMessage); they are reproduced as found.
    /// </remarks>
    [Unstable("0.16", "API may change")]
    internal abstract class OperatorTopologyWithDefaultCommunication :
        DriverAwareOperatorTopology,
        IOperatorTopologyWithCommunication
    {
        // Set once task registration has completed; until then messages are only queued.
        protected bool _initialized = false;

        protected DefaultCommunicationLayer _commLayer;

        protected readonly int _disposeTimeout;
        protected readonly int _timeout;
        protected readonly int _retry;

        // Messages waiting to be sent to the children.
        protected readonly ConcurrentQueue _sendQueue =
            new ConcurrentQueue();

        // Messages received and waiting to be consumed through Receive.
        protected readonly BlockingCollection _messageQueue =
            new BlockingCollection();

        protected readonly ConcurrentDictionary _children = new ConcurrentDictionary();
        protected readonly CancellationTokenSource _cancellationSignal = new CancellationTokenSource();

        /// <summary>
        /// Constructor for a communicating topology.
        /// </summary>
        /// <param name="stageName">The stage name the topology is working on</param>
        /// <param name="taskId">The identifier of the task the topology is running on</param>
        /// <param name="rootTaskId">The identifier of the root node in the topology</param>
        /// <param name="operatorId">The identifier of the operator for this topology</param>
        /// <param name="commLayer">Class responsible for communication</param>
        /// <param name="retry">How many times the topology will retry to send a message</param>
        /// <param name="timeout">After how long the topology waits for an event</param>
        /// <param name="disposeTimeout">Maximum wait time for topology disposal</param>
        protected OperatorTopologyWithDefaultCommunication(
            string stageName,
            string taskId,
            string rootTaskId,
            int operatorId,
            DefaultCommunicationLayer commLayer,
            int retry,
            int timeout,
            int disposeTimeout) : base(stageName, taskId, rootTaskId, operatorId)
        {
            _commLayer = commLayer;

            _retry = retry;
            _timeout = timeout;
            _disposeTimeout = disposeTimeout;
        }

        /// <summary>
        /// Communicate to the driver that the current stage has completed its
        /// execution. Only the root task notifies the driver.
        /// </summary>
        public void StageComplete()
        {
            if (TaskId == RootTaskId)
            {
                _commLayer.StageComplete(TaskId, StageName);
            }
        }

        /// <summary>
        /// Request a topology status update to the driver.
        /// </summary>
        public void TopologyUpdateRequest()
        {
            _commLayer.TopologyUpdateRequest(TaskId, StageName, OperatorId);
        }

        /// <summary>
        /// Waiting logic before disposing topologies: waits, up to the dispose timeout,
        /// for the send queue to drain.
        /// </summary>
        public override void WaitCompletionBeforeDisposing()
        {
            // UTC is used for the deadline so that daylight-saving or time-zone changes
            // of the local clock cannot shorten or stretch the wait.
            var tsEnd = DateTime.UtcNow.AddMilliseconds(_disposeTimeout);
            while (_sendQueue.Count > 0 && DateTime.UtcNow < tsEnd)
            {
                // The topology is still trying to send messages, wait.
                Thread.Sleep(100);
            }
        }

        /// <summary>
        /// Signal that the current task is joining the topology.
        /// </summary>
        public virtual void JoinTopology()
        {
            _commLayer.JoinTopology(TaskId, StageName, OperatorId);
        }

        /// <summary>
        /// Initializes the communication group.
        /// Computation blocks until all required tasks are registered in the group.
        /// </summary>
        /// <param name="cancellationSource">The signal to cancel the operation</param>
        /// <exception cref="OperationCanceledException">Thrown when the registration wait fails</exception>
        public virtual void WaitForTaskRegistration(CancellationTokenSource cancellationSource)
        {
            try
            {
                _commLayer.WaitForTaskRegistration(_children.Values, cancellationSource);
            }
            catch (Exception e)
            {
                throw new OperationCanceledException(
                    "Failed to find parent/children nodes in operator topology for node: " + TaskId, e);
            }

            _initialized = true;

            // Some message may have been received while we were setting up the topology. Send them.
            Send(cancellationSource);
        }

        /// <summary>
        /// Block and wait until a message is received.
        /// </summary>
        /// <param name="cancellationSource">The signal that the operation is cancelled</param>
        /// <returns>The received message</returns>
        /// <exception cref="OperationCanceledException">Thrown when cancellation is requested</exception>
        /// <exception cref="TimeoutException">Thrown when no message arrives within the configured retries</exception>
        public virtual ElasticGroupCommunicationMessage Receive(CancellationTokenSource cancellationSource)
        {
            for (int retry = 0; retry < _retry; ++retry)
            {
                if (_messageQueue.TryTake(out ElasticGroupCommunicationMessage message, _timeout, cancellationSource.Token))
                {
                    return message;
                }

                if (cancellationSource.IsCancellationRequested)
                {
                    throw new OperationCanceledException("Received cancellation request: stop receiving.");
                }
            }

            throw new TimeoutException($"Failed to receive message after {_retry} try.");
        }

        /// <summary>
        /// Send the input message. This method is asynchronous: the message is queued and
        /// delivered immediately only when the topology is already initialized.
        /// </summary>
        /// <param name="message">The message to communicate</param>
        /// <param name="cancellationSource">The signal for cancelling the operation</param>
        public virtual void Send(ElasticGroupCommunicationMessage message, CancellationTokenSource cancellationSource)
        {
            _sendQueue.Enqueue(message);

            if (_initialized)
            {
                Send(cancellationSource);
            }
        }

        /// <summary>
        /// Handler for incoming messages from other topology nodes.
        /// </summary>
        /// <param name="message">The message that needs to be delivered to the operator</param>
        public virtual void OnNext(NsMessage message)
        {
            if (_messageQueue.IsAddingCompleted && _messageQueue.Count > 0)
            {
                throw new IllegalStateException("Trying to add messages to a closed non-empty queue.");
            }

            _messageQueue.Add(message.Data);

            // Automatically forward the received message to the child nodes in the topology.
            if (!_children.IsEmpty)
            {
                _sendQueue.Enqueue(message.Data);
            }

            if (_initialized)
            {
                Send(_cancellationSignal);
            }
        }

        /// <summary>
        /// Dispose the topology: closes the receive queue, cancels pending sends and
        /// disposes the communication layer.
        /// </summary>
        public virtual void Dispose()
        {
            _messageQueue.CompleteAdding();

            _cancellationSignal.Cancel();

            _commLayer.Dispose();
        }

        /// <summary>
        /// Logic to execute in case the observable sends an error event.
        /// </summary>
        /// <param name="error">The error thrown on the observable.</param>
        public new void OnError(Exception error)
        {
            _messageQueue.CompleteAdding();
        }

        /// <summary>
        /// Logic to execute in case the observable sends a complete event.
        /// </summary>
        public new void OnCompleted()
        {
            _messageQueue.CompleteAdding();
        }

        /// <summary>
        /// Send the previously queued data messages to every child.
        /// </summary>
        /// <param name="cancellationSource">The signal in case the task is cancelled</param>
        protected virtual void Send(CancellationTokenSource cancellationSource)
        {
            while (_sendQueue.TryDequeue(out ElasticGroupCommunicationMessage message) && !cancellationSource.IsCancellationRequested)
            {
                foreach (var child in _children.Values)
                {
                    _commLayer.Send(child, message, cancellationSource);
                }
            }
        }
    }
}
diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/DriverAwareOperatorTopology.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/DriverAwareOperatorTopology.cs
new file mode 100644
index 0000000000..2923b5d4e3
--- /dev/null
+++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/DriverAwareOperatorTopology.cs
@@ -0,0 +1,71 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.
// See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

using Org.Apache.REEF.Network.Elastic.Comm;
using Org.Apache.REEF.Utilities.Attributes;
using System;

namespace Org.Apache.REEF.Network.Elastic.Topology.Physical
{
    /// <summary>
    /// Abstract class for topologies able to receive messages from the driver.
    /// </summary>
    /// <remarks>
    /// NOTE(review): the type argument of IObserver appears to have been stripped from this
    /// extract; presumably it is the DriverMessagePayload handled by OnNext - confirm.
    /// </remarks>
    [Unstable("0.16", "API may change")]
    public abstract class DriverAwareOperatorTopology : OperatorTopology, IObserver
    {
        /// <summary>
        /// Constructor.
        /// </summary>
        /// <param name="stageName">The stage name the topology is working on</param>
        /// <param name="taskId">The identifier of the task the topology is running on</param>
        /// <param name="rootTaskId">The identifier of the root node in the topology</param>
        /// <param name="operatorId">The identifier of the operator for this topology</param>
        protected DriverAwareOperatorTopology(string stageName, string taskId, string rootTaskId, int operatorId)
            : base(stageName, taskId, rootTaskId, operatorId)
        {
        }

        /// <summary>
        /// Basic handler for messages coming from the driver.
        /// Known payload types are accepted and ignored; any other type is rejected.
        /// </summary>
        /// <param name="message">Message from the driver</param>
        /// <exception cref="ArgumentException">Thrown when the payload type is not one of the known types</exception>
        public virtual void OnNext(DriverMessagePayload message)
        {
            switch (message.PayloadType)
            {
                // Recognized types: nothing to do at this level, subclasses override to react.
                case DriverMessagePayloadType.Ring:
                case DriverMessagePayloadType.Resume:
                case DriverMessagePayloadType.Update:
                case DriverMessagePayloadType.Failure:
                    break;
                default:
                    throw new ArgumentException($"Message type {message.PayloadType} not recognized.");
            }
        }

        #region Empty Handlers

        /// <summary>
        /// Error notifications are ignored at this level.
        /// </summary>
        /// <param name="error">The error sent by the observable</param>
        public void OnError(Exception error)
        {
        }

        /// <summary>
        /// Completion notifications are ignored at this level.
        /// </summary>
        public void OnCompleted()
        {
        }
        #endregion
    }
}
diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/IOperatorTopologyWithCommunication.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/IOperatorTopologyWithCommunication.cs
new file mode 100644
index 0000000000..314976d761
--- /dev/null
+++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/IOperatorTopologyWithCommunication.cs
@@ -0,0 +1,37 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
using Org.Apache.REEF.Network.Elastic.Comm.Impl;
using Org.Apache.REEF.Network.Elastic.Task;
using Org.Apache.REEF.Network.NetworkService;
using Org.Apache.REEF.Utilities.Attributes;
using System;

namespace Org.Apache.REEF.Network.Elastic.Topology.Physical
{
    /// <summary>
    /// Base interface for topologies where nodes communicate between themselves.
    /// It only aggregates other interfaces and declares no members of its own.
    /// </summary>
    /// <remarks>
    /// NOTE(review): the type arguments of IObserver appear to have been stripped from this extract.
    /// </remarks>
    [Unstable("0.16", "API may change")]
    internal interface IOperatorTopologyWithCommunication :
        IWaitForTaskRegistration,
        INodeIdentifier,
        IDisposable,
        IObserver>
    {
    }
}
diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/OperatorTopology.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/OperatorTopology.cs
new file mode 100644
index 0000000000..8e9bbc7ee5
--- /dev/null
+++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/OperatorTopology.cs
@@ -0,0 +1,72 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

using Org.Apache.REEF.Network.Elastic.Task;
using Org.Apache.REEF.Utilities.Attributes;

namespace Org.Apache.REEF.Network.Elastic.Topology.Physical
{
    /// <summary>
    /// Base class for task-side topologies. Task-side topologies are
    /// not generic but directly related to the operators using them to communicate data.
    /// </summary>
    [Unstable("0.16", "API may change")]
    public abstract class OperatorTopology : INodeIdentifier
    {
        /// <summary>
        /// Constructor for an operator topology. Stores the four identifiers as they are given.
        /// </summary>
        /// <param name="stageName">The stage name the topology is working on</param>
        /// <param name="taskId">The identifier of the task the topology is running on</param>
        /// <param name="rootTaskId">The identifier of the root node in the topology</param>
        /// <param name="operatorId">The identifier of the operator for this topology</param>
        protected OperatorTopology(string stageName, string taskId, string rootTaskId, int operatorId)
        {
            StageName = stageName;
            TaskId = taskId;
            RootTaskId = rootTaskId;
            OperatorId = operatorId;
        }

        /// <summary>
        /// The stage name context in which the topology is running.
        /// </summary>
        public string StageName { get; }

        /// <summary>
        /// The identifier of the operator in which the topology is running.
        /// </summary>
        public int OperatorId { get; }

        /// <summary>
        /// The identifier of the task in which the topology is running.
        /// </summary>
        protected string TaskId { get; }

        /// <summary>
        /// The task identifier of the root node of the topology.
        /// Unlike the other identifiers, it can be reassigned by derived classes.
        /// </summary>
        protected string RootTaskId { get; set; }

        /// <summary>
        /// Waiting logic before disposing topologies.
        /// The base implementation does nothing; derived classes override it to wait.
        /// </summary>
        public virtual void WaitCompletionBeforeDisposing()
        {
        }
    }
}
diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Utils.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Utils.cs
new file mode 100644
index 0000000000..c9659f3e2d
--- /dev/null
+++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Utils.cs
@@ -0,0 +1,142 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

using Org.Apache.REEF.Driver.Context;
using Org.Apache.REEF.Utilities.Attributes;
using System;
using System.Globalization;

namespace Org.Apache.REEF.Network.Elastic
{
    /// <summary>
    /// Utility class for building and parsing task and context identifiers.
    /// Identifiers have the form <c>kind-stage-number</c>, e.g. <c>Task-Broadcast-3</c>.
    /// </summary>
    [Unstable("0.16", "API may change")]
    public static class Utils
    {
        private const string TaskKind = "Task";
        private const string ContextKind = "Context";
        private const char Separator = '-';

        /// <summary>
        /// Gets the context number associated with the active context id.
        /// </summary>
        /// <param name="activeContext">The active context to check</param>
        /// <returns>The context number associated with the active context id</returns>
        public static int GetContextNum(IActiveContext activeContext)
        {
            return int.Parse(GetValue(2, activeContext.Id), CultureInfo.InvariantCulture);
        }

        /// <summary>
        /// Gets the stages associated with the active context id.
        /// </summary>
        /// <param name="activeContext">The active context to check</param>
        /// <returns>The stage names associated with the active context id</returns>
        public static string GetContextStages(IActiveContext activeContext)
        {
            return GetValue(1, activeContext.Id);
        }

        /// <summary>
        /// Gets the stages associated with the context id.
        /// </summary>
        /// <param name="id">The context id to check</param>
        /// <returns>The stage names associated with the context id</returns>
        public static string GetContextStages(string id)
        {
            return GetValue(1, id);
        }

        /// <summary>
        /// Gets the stages associated with the Task id.
        /// </summary>
        /// <param name="taskId">The task id to check</param>
        /// <returns>The stage names associated with the task id</returns>
        public static string GetTaskStages(string taskId)
        {
            return GetValue(1, taskId);
        }

        /// <summary>
        /// Gets the task number associated with the Task id.
        /// </summary>
        /// <param name="taskId">The task id to check</param>
        /// <returns>The task number associated with the task id</returns>
        public static int GetTaskNum(string taskId)
        {
            return int.Parse(GetValue(2, taskId), CultureInfo.InvariantCulture);
        }

        /// <summary>
        /// Builds a context identifier out of a stage(s) and a context number.
        /// </summary>
        /// <param name="stageName">The stages active in the context</param>
        /// <param name="contextNum">The context number</param>
        /// <returns>The context identifier</returns>
        public static string BuildContextId(string stageName, int contextNum)
        {
            return BuildIdentifier(ContextKind, stageName, contextNum);
        }

        /// <summary>
        /// Builds a task identifier out of a stage(s) and an id.
        /// </summary>
        /// <param name="stageName">The stages active in the task</param>
        /// <param name="id">The task id</param>
        /// <returns>The task identifier</returns>
        public static string BuildTaskId(string stageName, int id)
        {
            return BuildIdentifier(TaskKind, stageName, id);
        }

        /// <summary>
        /// Gets the context associated with the task id.
        /// </summary>
        /// <param name="taskId">The task id to check</param>
        /// <returns>The context id associated with the task id</returns>
        /// <exception cref="ArgumentNullException">Thrown when <paramref name="taskId"/> is null</exception>
        public static string GetContextIdFromTaskId(string taskId)
        {
            if (taskId == null)
            {
                throw new ArgumentNullException(nameof(taskId));
            }

            // Only the leading kind marker is rewritten: a blanket Replace would also alter
            // stage names that happen to contain "Task" (e.g. "Task-MyTaskStage-3").
            if (taskId.StartsWith(TaskKind + Separator, StringComparison.Ordinal))
            {
                return ContextKind + taskId.Substring(TaskKind.Length);
            }

            // Identifiers that were not produced by BuildTaskId keep the previous behavior.
            return taskId.Replace(TaskKind, ContextKind);
        }

        /// <summary>
        /// Utility method returning an identifier by merging the input fields
        /// </summary>
        /// <param name="first">The first field</param>
        /// <param name="second">The second field</param>
        /// <param name="third">The third field</param>
        /// <returns>An id merging the three fields</returns>
        private static string BuildIdentifier(string first, string second, int third)
        {
            return $"{first}-{second}-{third}";
        }

        /// <summary>
        /// Utility method returning a requested field out of an identifier
        /// </summary>
        /// <param name="field">The zero-based index (0 to 2) of the field of interest</param>
        /// <param name="identifier">The id to check</param>
        /// <returns>The field value extracted from the identifier</returns>
        /// <exception cref="ArgumentNullException">Thrown when <paramref name="identifier"/> is null</exception>
        /// <exception cref="ArgumentException">
        /// Thrown when the identifier is not made of exactly three fields or the field index is out of range
        /// </exception>
        private static string GetValue(int field, string identifier)
        {
            if (identifier == null)
            {
                throw new ArgumentNullException(nameof(identifier));
            }

            string[] parts = identifier.Split(Separator);
            if (parts.Length != 3 || field < 0 || field > 2)
            {
                throw new ArgumentException($"Invalid identifier '{identifier}' (requested field {field})");
            }

            return parts[field];
        }
    }
}
diff --git a/lang/cs/Org.Apache.REEF.Network/Group/Task/Impl/GroupCommNetworkObserver.cs b/lang/cs/Org.Apache.REEF.Network/Group/Task/Impl/GroupCommNetworkObserver.cs
index
public void OnNext(NsMessage value)
{
    // NOTE(review): generic type arguments are not visible in the reviewed excerpt;
    // the tokens below are reproduced exactly as shown there.

    // The message carries a single payload (value.Data). Payloads that are not a
    // GroupCommunicationMessage make the 'as' cast yield null and are ignored.
    var gcMessage = value.Data as GroupCommunicationMessage;

    // Only messages that actually carry data (non-null, non-empty Data) are handed
    // to the node structure; everything else is dropped without any notification.
    if (gcMessage != null && gcMessage.Data != null && gcMessage.Data.Length > 0)
    {
        _nodeStruct.AddData(gcMessage);
    }
}
(!_observers.TryGetValue(NodeObserverIdentifier.FromMessage(gcMessage), out observer)) diff --git a/lang/cs/Org.Apache.REEF.Network/NetworkService/Codec/NsMessageCodec.cs b/lang/cs/Org.Apache.REEF.Network/NetworkService/Codec/NsMessageCodec.cs index ecb9b135d0..0fa83ef389 100644 --- a/lang/cs/Org.Apache.REEF.Network/NetworkService/Codec/NsMessageCodec.cs +++ b/lang/cs/Org.Apache.REEF.Network/NetworkService/Codec/NsMessageCodec.cs @@ -71,10 +71,9 @@ public NsMessage Decode(byte[] data) IIdentifier sourceId = _idFactory.Create(proto.SourceId); IIdentifier destId = _idFactory.Create(proto.DestId); - NsMessage message = new NsMessage(sourceId, destId); + var payload = _codec.Decode(proto.Data); - var messages = proto.Data.Select(byteArr => _codec.Decode(byteArr)); - message.Data.AddRange(messages); + NsMessage message = new NsMessage(sourceId, destId, payload); return message; } } diff --git a/lang/cs/Org.Apache.REEF.Network/NetworkService/Codec/NsMessageProto.cs b/lang/cs/Org.Apache.REEF.Network/NetworkService/Codec/NsMessageProto.cs index aad73d5ec2..20f79db0ef 100644 --- a/lang/cs/Org.Apache.REEF.Network/NetworkService/Codec/NsMessageProto.cs +++ b/lang/cs/Org.Apache.REEF.Network/NetworkService/Codec/NsMessageProto.cs @@ -15,7 +15,6 @@ // specific language governing permissions and limitations // under the License. 
-using System.Collections.Generic; using Org.Apache.REEF.Wake.Remote; using ProtoBuf; @@ -26,7 +25,6 @@ public class NsMessageProto { public NsMessageProto() { - Data = new List(); } [ProtoMember(1)] @@ -36,7 +34,7 @@ public NsMessageProto() public string DestId { get; set; } [ProtoMember(3)] - public List Data { get; set; } + public byte[] Data { get; private set; } public static NsMessageProto Create(NsMessage message, ICodec codec) { @@ -45,10 +43,7 @@ public static NsMessageProto Create(NsMessage message, ICodec codec) proto.SourceId = message.SourceId.ToString(); proto.DestId = message.DestId.ToString(); - foreach (T item in message.Data) - { - proto.Data.Add(codec.Encode(item)); - } + proto.Data = codec.Encode(message.Data); return proto; } diff --git a/lang/cs/Org.Apache.REEF.Network/NetworkService/Codec/NsMessageStreamingCodec.cs b/lang/cs/Org.Apache.REEF.Network/NetworkService/Codec/NsMessageStreamingCodec.cs index 76ccbba93f..965ad9a5fe 100644 --- a/lang/cs/Org.Apache.REEF.Network/NetworkService/Codec/NsMessageStreamingCodec.cs +++ b/lang/cs/Org.Apache.REEF.Network/NetworkService/Codec/NsMessageStreamingCodec.cs @@ -58,19 +58,13 @@ public NsMessage Read(IDataReader reader) { int metadataSize = reader.ReadInt32(); byte[] metadata = new byte[metadataSize]; - reader.Read(ref metadata, 0, metadataSize); - var res = GenerateMetaDataDecoding(metadata); - Type messageType = res.Item3; - NsMessage message = res.Item1; + reader.Read(ref metadata, 0, metadataSize); + var res = GenerateMetaDataDecoding(metadata); + Type messageType = res.type; var codecReadFunc = _codecFunctionsCache.ReadFunction(messageType); - int messageCount = res.Item2; - - for (int i = 0; i < messageCount; i++) - { - message.Data.Add(codecReadFunc(reader)); - } + var message = new NsMessage(res.source, res.destination, codecReadFunc(reader)); return message; } @@ -87,13 +81,10 @@ public void Write(NsMessage obj, IDataWriter writer) byte[] totalEncoding = 
encodedInt.Concat(encodedMetadata).ToArray(); writer.Write(totalEncoding, 0, totalEncoding.Length); - Type messageType = obj.Data[0].GetType(); + Type messageType = obj.Data.GetType(); var codecWriteFunc = _codecFunctionsCache.WriteFunction(messageType); - - foreach (var data in obj.Data) - { - codecWriteFunc(data, writer); - } + + codecWriteFunc(obj.Data, writer); } /// @@ -108,15 +99,9 @@ public async Task> ReadAsync(IDataReader reader, CancellationToken byte[] metadata = new byte[metadataSize]; await reader.ReadAsync(metadata, 0, metadataSize, token); var res = GenerateMetaDataDecoding(metadata); - Type messageType = res.Item3; - NsMessage message = res.Item1; + Type messageType = res.type; var codecReadFunc = _codecFunctionsCache.ReadAsyncFunction(messageType); - int messageCount = res.Item2; - - for (int i = 0; i < messageCount; i++) - { - message.Data.Add(codecReadFunc(reader, token)); - } + var message = new NsMessage(res.source, res.destination, codecReadFunc(reader, token)); return message; } @@ -134,15 +119,12 @@ public async Task WriteAsync(NsMessage obj, IDataWriter writer, CancellationT byte[] totalEncoding = encodedInt.Concat(encodedMetadata).ToArray(); await writer.WriteAsync(totalEncoding, 0, totalEncoding.Length, token); - Type messageType = obj.Data[0].GetType(); + Type messageType = obj.Data.GetType(); var codecWriteFunc = _codecFunctionsCache.WriteAsyncFunction(messageType); - foreach (var data in obj.Data) - { - var asyncResult = codecWriteFunc.BeginInvoke(data, writer, token, null, null); - await codecWriteFunc.EndInvoke(asyncResult); - } + var asyncResult = codecWriteFunc.BeginInvoke(obj.Data, writer, token, null, null); + await codecWriteFunc.EndInvoke(asyncResult); } private static byte[] GenerateMetaDataEncoding(NsMessage obj) @@ -150,8 +132,7 @@ private static byte[] GenerateMetaDataEncoding(NsMessage obj) List metadataBytes = new List(); byte[] sourceBytes = StringToBytes(obj.SourceId.ToString()); byte[] dstBytes = 
StringToBytes(obj.DestId.ToString()); - byte[] messageTypeBytes = StringToBytes(obj.Data[0].GetType().AssemblyQualifiedName); - byte[] messageCount = BitConverter.GetBytes(obj.Data.Count); + byte[] messageTypeBytes = StringToBytes(obj.Data.GetType().AssemblyQualifiedName); metadataBytes.Add(BitConverter.GetBytes(sourceBytes.Length)); metadataBytes.Add(BitConverter.GetBytes(dstBytes.Length)); @@ -159,12 +140,11 @@ private static byte[] GenerateMetaDataEncoding(NsMessage obj) metadataBytes.Add(sourceBytes); metadataBytes.Add(dstBytes); metadataBytes.Add(messageTypeBytes); - metadataBytes.Add(messageCount); return metadataBytes.SelectMany(i => i).ToArray(); } - private Tuple, int, Type> GenerateMetaDataDecoding(byte[] obj) + private (IIdentifier source, IIdentifier destination, Type type) GenerateMetaDataDecoding(byte[] obj) { int srcCount = BitConverter.ToInt32(obj, 0); int dstCount = BitConverter.ToInt32(obj, sizeof(int)); @@ -177,10 +157,8 @@ private Tuple, int, Type> GenerateMetaDataDecoding(byte[] obj) offset += dstCount; Type msgType = Type.GetType(BytesToString(obj.Skip(offset).Take(msgTypeCount).ToArray())); offset += msgTypeCount; - int messageCount = BitConverter.ToInt32(obj, offset); - NsMessage msg = new NsMessage(_idFactory.Create(srcString), _idFactory.Create(dstString)); - return new Tuple, int, Type>(msg, messageCount, msgType); + return (source: _idFactory.Create(srcString), destination: _idFactory.Create(dstString), type: msgType); } private static byte[] StringToBytes(string str) diff --git a/lang/cs/Org.Apache.REEF.Network/NetworkService/IConnection.cs b/lang/cs/Org.Apache.REEF.Network/NetworkService/IConnection.cs index 70f814e827..f90df23360 100644 --- a/lang/cs/Org.Apache.REEF.Network/NetworkService/IConnection.cs +++ b/lang/cs/Org.Apache.REEF.Network/NetworkService/IConnection.cs @@ -24,6 +24,11 @@ namespace Org.Apache.REEF.Network.NetworkService /// public interface IConnection : IDisposable { + /// + /// Whether the connection is open or not. 
/// <summary>
/// Closes the connection: marks it as not open and disposes the remote sender
/// when that sender is disposable. Does nothing if no sender was ever obtained.
/// </summary>
public void Dispose()
{
    if (_remoteSender == null)
    {
        // IsOpen is only set to true right after a sender has been obtained,
        // so there is nothing to reset or release here.
        return;
    }

    IsOpen = false;

    // The sender is only known through its IObserver interface. Dispose it when the
    // concrete type supports that, instead of dereferencing a possibly-null cast.
    (_remoteSender as IDisposable)?.Dispose();
}
/// <summary>
/// The data being sent in the message.
/// It is set once by the constructor from its message argument.
/// </summary>
public T Data { get; private set; }
/// <summary>
/// Disposes the connection registered for the given destination and drops it
/// from the connection map. Unknown destinations are ignored.
/// </summary>
/// <param name="destinationId">The id of the node to disconnect</param>
public void RemoveConnection(IIdentifier destinationId)
{
    if (!_connectionMap.TryGetValue(destinationId, out var registered))
    {
        // No connection was registered for this destination.
        return;
    }

    registered.Dispose();
    _connectionMap.Remove(destinationId);
}
@@ -44,7 +44,7 @@ public static string GetTaskId(IConfiguration taskConfiguration) } catch (InjectionException) { - LOGGER.Log(Level.Error, "Unable to find task identifier"); + Log.Log(Level.Error, "Unable to find task identifier"); throw; } } @@ -64,7 +64,7 @@ public static string GetContextId(IConfiguration contextConfiguration) } catch (InjectionException) { - LOGGER.Log(Level.Error, "Unable to find task identifier"); + Log.Log(Level.Error, "Unable to find task identifier"); throw; } } diff --git a/lang/cs/Org.Apache.REEF.Tang/Formats/AvroConfigurationSerializer.cs b/lang/cs/Org.Apache.REEF.Tang/Formats/AvroConfigurationSerializer.cs index 565d4fdd57..86df2bea29 100644 --- a/lang/cs/Org.Apache.REEF.Tang/Formats/AvroConfigurationSerializer.cs +++ b/lang/cs/Org.Apache.REEF.Tang/Formats/AvroConfigurationSerializer.cs @@ -15,13 +15,6 @@ // specific language governing permissions and limitations // under the License. -using System; -using System.Collections; -using System.Collections.Generic; -using System.IO; -using System.Linq; -using System.Runtime.Serialization; -using System.Text; using Microsoft.Hadoop.Avro; using Microsoft.Hadoop.Avro.Container; using Newtonsoft.Json; @@ -34,6 +27,13 @@ using Org.Apache.REEF.Tang.Types; using Org.Apache.REEF.Tang.Util; using Org.Apache.REEF.Utilities.Logging; +using System; +using System.Collections; +using System.Collections.Generic; +using System.IO; +using System.Linq; +using System.Runtime.Serialization; +using System.Text; namespace Org.Apache.REEF.Tang.Formats { @@ -99,7 +99,7 @@ public void ToFile(IConfiguration c, string fileName) var e = new TangApplicationException("Error during file operation. 
// Serialize every bound list: one ConfigurationEntry per list item, keyed by the
// full name of the named parameter the list is bound to.
foreach (var kvp in conf.GetBoundList())
{
    foreach (var item in kvp.Value)
    {
        string val = null;
        if (item is string)
        {
            val = (string)item;
        }
        else if (item is INode)
        {
            val = ((INode)item).GetFullName();
        }
        else
        {
            // Must be an interpolated string; without the '$' prefix the placeholder
            // text was emitted literally instead of the offending type.
            throw new TangApplicationException($"Unable to serialize list of type {item.GetType()}");
        }

        l.Add(new ConfigurationEntry(kvp.Key.GetFullName(), val));
    }
}
b/lang/cs/Org.Apache.REEF.Tang/Formats/ConfigurationFile.cs @@ -15,12 +15,6 @@ // specific language governing permissions and limitations // under the License. -using System; -using System.Collections; -using System.Collections.Generic; -using System.IO; -using System.Linq; -using System.Text; using Org.Apache.REEF.Tang.Exceptions; using Org.Apache.REEF.Tang.Implementations.Configuration; using Org.Apache.REEF.Tang.Implementations.Tang; @@ -28,6 +22,12 @@ using Org.Apache.REEF.Tang.Types; using Org.Apache.REEF.Tang.Util; using Org.Apache.REEF.Utilities.Logging; +using System; +using System.Collections; +using System.Collections.Generic; +using System.IO; +using System.Linq; +using System.Text; namespace Org.Apache.REEF.Tang.Formats { @@ -47,10 +47,10 @@ public static void WriteConfigurationFile(IConfiguration c, string fileName) } } - public static string ToConfigurationString(IConfiguration c) + public static string ToConfigurationString(IConfiguration c) { StringBuilder sb = new StringBuilder(); - foreach (string s in ToConfigurationStringList(c)) + foreach (string s in ToConfigurationStringList(c)) { sb.Append(s); sb.Append('\n'); @@ -64,7 +64,7 @@ private static string GetFullName(INode n) Type t = ReflectionUtilities.GetTypeByName(s); return t.FullName; } - + private static string GetFullName(string name) { try @@ -93,20 +93,20 @@ private static string GetAssemblyName(string s) } } - public static HashSet ToConfigurationStringList(IConfiguration c) + public static HashSet ToConfigurationStringList(IConfiguration c) { ConfigurationImpl conf = (ConfigurationImpl)c; HashSet l = new HashSet(); - foreach (IClassNode opt in conf.GetBoundImplementations()) + foreach (IClassNode opt in conf.GetBoundImplementations()) { l.Add(GetFullName(opt) + '=' + Escape(GetFullName(conf.GetBoundImplementation(opt)))); } - - foreach (IClassNode opt in conf.GetBoundConstructors()) + + foreach (IClassNode opt in conf.GetBoundConstructors()) { l.Add(GetFullName(opt) + '=' + 
// Serialize every bound list as one "name=value" line per list item.
// NOTE(review): 'l' is declared as a HashSet earlier in this method, so repeated
// items of a list collapse into one entry and insertion order is not guaranteed.
// Confirm that this is acceptable for list parameters.
foreach (var kvp in conf.GetBoundList())
{
    foreach (var item in kvp.Value)
    {
        string val = null;
        if (item is string)
        {
            val = GetFullName((string)item);
        }
        else if (item is INode)
        {
            // Inspect the item itself: kvp.Value is the whole list and never an INode,
            // so testing it sent every INode item into the exception branch below.
            val = GetFullName((INode)item);
        }
        else
        {
            throw new BindException($"Failed to serialize list of unsupported type {item.GetType()}");
        }

        l.Add(GetFullName(kvp.Key) + '=' + Escape(val));
    }
}
/// <summary>
/// Creates a builder whose class hierarchy is the default one for the given
/// assemblies and parsers, then merges in every supplied configuration.
/// </summary>
/// <param name="assemblies">Assemblies used to obtain the default class hierarchy</param>
/// <param name="confs">Configurations to merge into this builder; no element may be null</param>
/// <param name="parsers">Parser types used to obtain the default class hierarchy</param>
/// <exception cref="ArgumentNullException">If one of the configurations is null</exception>
protected ConfigurationBuilderImpl(string[] assemblies, IConfiguration[] confs, Type[] parsers)
{
    this.ClassHierarchy = TangFactory.GetTang().GetDefaultClassHierarchy(assemblies, parsers);
    foreach (IConfiguration tc in confs)
    {
        if (tc == null)
        {
            // The single-string constructor treats its argument as the parameter name,
            // so pass the parameter name and the message separately.
            throw new ArgumentNullException(nameof(confs), "One of specified configurations is null");
        }

        AddConfiguration((ConfigurationImpl)tc);
    }
}
Not supported."), LOGGER); } } - foreach (IClassNode cn in builder.BoundImpls.Keys) + foreach (IClassNode cn in builder.BoundImpls.Keys) { IClassNode n = null; builder.BoundImpls.TryGetValue(cn, out n); @@ -121,7 +121,7 @@ private void AddConfiguration(IClassHierarchy ns, ConfigurationBuilderImpl build } } - foreach (IClassNode cn in builder.BoundConstructors.Keys) + foreach (IClassNode cn in builder.BoundConstructors.Keys) { IClassNode n = null; builder.BoundConstructors.TryGetValue(cn, out n); @@ -134,32 +134,32 @@ private void AddConfiguration(IClassHierarchy ns, ConfigurationBuilderImpl build // The namedParameters set contains the strings that can be used to // instantiate new // named parameter instances. Create new ones where we can. - foreach (INamedParameterNode np in builder.NamedParameters.Keys) + foreach (INamedParameterNode np in builder.NamedParameters.Keys) { string v = null; builder.NamedParameters.TryGetValue(np, out v); Bind(np.GetFullName(), v); } - - foreach (IClassNode cn in builder.LegacyConstructors.Keys) + + foreach (IClassNode cn in builder.LegacyConstructors.Keys) { IConstructorDef cd = null; builder.LegacyConstructors.TryGetValue(cn, out cd); - RegisterLegacyConstructor(cn, cd.GetArgs()); + RegisterLegacyConstructor(cn, cd.GetArgs()); } - foreach (KeyValuePair e in builder.BoundSetEntries) + foreach (KeyValuePair e in builder.BoundSetEntries) { string name = ((INamedParameterNode)e.Key).GetFullName(); - if (e.Value is INode) + if (e.Value is INode) { BindSetEntry(name, (INode)e.Value); - } - else if (e.Value is string) + } + else if (e.Value is string) { BindSetEntry(name, (string)e.Value); - } - else + } + else { var ex = new IllegalStateException(string.Format(CultureInfo.CurrentCulture, "The value {0} set to the named parameter {1} is illegel.", e.Value, name)); Org.Apache.REEF.Utilities.Diagnostics.Exceptions.Throw(ex, LOGGER); @@ -266,16 +266,20 @@ public void Bind(Types.INode key, Types.INode value) public void 
// Appends a single string value to the list bound to the given named parameter,
// creating and registering the list on first use.
// NOTE(review): generic type arguments are not visible in the reviewed excerpt;
// the tokens below are reproduced exactly as shown there.
public void BindList(INamedParameterNode iface, string impl)
{
    IList l;
    if (!BoundLists.TryGetValue(iface, out l))
    {
        // First value for this parameter: start a new list and register it.
        l = new List();
        BoundLists.Add(iface, l);
    }

    // Unlike the IList overload of BindList in this class, the value is not checked
    // for null or empty here.
    l.Add((object)impl);
}
a/lang/cs/Org.Apache.REEF.Tang/Implementations/Configuration/CsConfigurationBuilderImpl.cs +++ b/lang/cs/Org.Apache.REEF.Tang/Implementations/Configuration/CsConfigurationBuilderImpl.cs @@ -49,7 +49,7 @@ public CsConfigurationBuilderImpl(ICsClassHierarchy classHierarchy) : base(classHierarchy) { } - + public CsConfigurationBuilderImpl(string[] assemblies) : base(assemblies) { @@ -225,13 +225,13 @@ public ICsConfigurationBuilder BindList(GenericType iface, IList(GenericType iface, IList impl) where U : Name> { - return ((ICsInternalConfigurationBuilder)this).BindList(typeof(U), impl); + return ((ICsInternalConfigurationBuilder)this).BindList(typeof(U), impl); } public ICsConfigurationBuilder BindList(Type iface, IList implList) @@ -399,7 +399,15 @@ ICsInternalConfigurationBuilder ICsInternalConfigurationBuilder.BindList(Type if Org.Apache.REEF.Utilities.Diagnostics.Exceptions.Throw(ex, LOGGER); } - BindList((INamedParameterNode)n, implList); + try + { + BindList((INamedParameterNode)n, implList); + } + catch (ArgumentException ex) + { + throw new BindException($"BindList failed to bind for {iface.Name}, reason: {ex.Message}"); + } + return this; } @@ -433,7 +441,7 @@ ICsInternalConfigurationBuilder ICsInternalConfigurationBuilder.BindConstructor( #endregion ICsInternalConfigurationBuilder #region extension methods - + public ICsConfigurationBuilder BindNamedParam(string str) where TName : Name { return BindNamedParameter(GenericType.Class, str); diff --git a/lang/cs/Org.Apache.REEF.Utilities/ByteUtilities.cs b/lang/cs/Org.Apache.REEF.Utilities/ByteUtilities.cs index 0e21453c97..4e3212f32e 100644 --- a/lang/cs/Org.Apache.REEF.Utilities/ByteUtilities.cs +++ b/lang/cs/Org.Apache.REEF.Utilities/ByteUtilities.cs @@ -49,6 +49,14 @@ public static string ByteArraysToString(byte[] b) return Encoding.UTF8.GetString(b); } + /// + /// Converts from a UTF-8 encoded byte array to a string. 
/// <summary>
/// Decodes a slice of a UTF-8 encoded byte array into a string.
/// </summary>
/// <param name="b">The byte array holding the UTF-8 data</param>
/// <param name="start">The index of the first byte to decode</param>
/// <param name="length">The number of bytes to decode</param>
/// <returns>The decoded string</returns>
public static string ByteArraysToString(byte[] b, int start, int length)
    => Encoding.UTF8.GetString(b, start, length);