@@ -53,22 +53,20 @@ def __init__(
         self._action = None
         self._frames_seen = 0

-    def act(self, state, reward):
-        self.replay_buffer.store(self._state, self._action, reward, state)
+    def act(self, state):
+        self.replay_buffer.store(self._state, self._action, state)
         self._train()
         self._state = state
         self._action = self._choose_action(state)
         return self._action

-    def eval(self, state, _):
-        return self._best_actions(self.q_dist.eval(state))
+    def eval(self, state):
+        return self._best_actions(self.q_dist.eval(state)).item()

     def _choose_action(self, state):
         if self._should_explore():
-            return torch.randint(
-                self.q_dist.n_actions, (len(state),), device=self.q_dist.device
-            )
-        return self._best_actions(self.q_dist.no_grad(state))
+            return np.random.randint(0, self.q_dist.n_actions)
+        return self._best_actions(self.q_dist.no_grad(state)).item()

     def _should_explore(self):
         return (
@@ -77,8 +75,8 @@ def _should_explore(self):
         )

     def _best_actions(self, probs):
-        q_values = (probs * self.q_dist.atoms).sum(dim=2)
-        return torch.argmax(q_values, dim=1)
+        q_values = (probs * self.q_dist.atoms).sum(dim=-1)
+        return torch.argmax(q_values, dim=-1)

     def _train(self):
         if self._should_train():
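The net effect of this change is that the agent now acts in a single environment, returning a plain Python action index via .item() and sampling exploration actions with np.random.randint, instead of operating on a batch of states, and _best_actions reduces the value distribution over its last axis (dim=-1) rather than hard-coded batch dimensions. Below is a minimal, self-contained sketch of that expected-Q computation and the single-environment epsilon-greedy step; the shapes, atom support, and epsilon value are made up for illustration and are not taken from this repository.

import numpy as np
import torch

# Illustrative shapes: probs is (n_actions, n_atoms) for a single state,
# atoms is the (n_atoms,) support of the value distribution.
n_actions, n_atoms = 4, 51
atoms = torch.linspace(-10.0, 10.0, n_atoms)            # value support z_i
probs = torch.softmax(torch.randn(n_actions, n_atoms), dim=-1)

# Expected Q-value per action: Q(s, a) = sum_i p_i(s, a) * z_i.
# Reducing over dim=-1 works with or without a leading batch dimension,
# which is why the diff replaces dim=2 / dim=1 with dim=-1.
q_values = (probs * atoms).sum(dim=-1)                   # shape: (n_actions,)
best_action = torch.argmax(q_values, dim=-1).item()      # plain Python int

# Epsilon-greedy exploration for a single environment: a scalar random
# action index replaces the batched torch.randint call.
epsilon = 0.1  # assumed value for illustration only
if np.random.rand() < epsilon:
    action = np.random.randint(0, n_actions)
else:
    action = best_action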