
Commit ed2cb91

First commit.

1 parent 3c77f27 commit ed2cb91

13 files changed (+695 -0 lines changed)

.gitignore

Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
__pycache__/
results/

README.md

Lines changed: 48 additions & 0 deletions
@@ -1,2 +1,50 @@
# ppo-pytorch

A PyTorch implementation of PPO (Proximal Policy Optimization).

We run multiple episodes with the same policy and build an experience buffer from the trajectories collected in those episodes, then perform on-policy policy gradient updates with PPO. The buffer from the previous run is cleared before the next run of episodes starts.
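
A rough sketch of that outer loop, assuming names from `eval.py` and `src/experience_memory.py`; `train.py` is not shown in this commit, so `rollout_mode='train'` and `policy.update` are hypothetical:
```
# Hypothetical outline of the training loop described above.
num_runs = args.training_episodes // args.num_episodes_per_run
for run in range(num_runs):
    experience_memory.reset()                    # clear the buffer from the last run
    for _ in range(args.num_episodes_per_run):   # roll out episodes with the current policy
        rollout.run(args, rollout_mode='train')  # trajectories land in experience_memory
    experience_memory.update_tensor()            # dict of lists -> dict of tensors
    policy.update(experience_memory)             # a few epochs of clipped PPO updates
```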

## Set up Python environment
Run
```
virtualenv -p /usr/bin/python3 ppoenv
source ppoenv/bin/activate
pip install -r requirements.txt
```
or
```
virtualenv -p /usr/bin/python3 ppoenv
source ppoenv/bin/activate
pip install gym==0.18.0
pip install torch
pip install tqdm
pip install tensorboard
```

## Train and evaluate the agent (CartPole)
Train the agent by running
```
source ppoenv/bin/activate
python train.py
```
Check training progress by running
```
source ppoenv/bin/activate
tensorboard --logdir results/
```
After training is complete, find `[SAVED_LOG]` in `results/` (e.g., `20221023_172239`). To evaluate without visualization, run
```
source ppoenv/bin/activate
python eval.py --log [SAVED_LOG]
```
To evaluate with visualization, run
```
source ppoenv/bin/activate
python eval.py --log [SAVED_LOG] --visualize
```
To evaluate the checkpoint saved at a specific training episode (e.g., 1000), run
```
source ppoenv/bin/activate
python eval.py --log [SAVED_LOG] --visualize --training_episodes 1000
```

## Credits
Code is borrowed from [ikostrikov/pytorch-a2c-ppo-acktr-gail](https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail), [vita-epfl/CrowdNav](https://github.com/vita-epfl/CrowdNav), and [agrimgupta92/sgan](https://github.com/agrimgupta92/sgan).

arg_parse.py

Lines changed: 33 additions & 0 deletions
@@ -0,0 +1,33 @@
import argparse

def arg_parse():
    parser = argparse.ArgumentParser()
    # gym environment hyperparameters
    parser.add_argument('--state_size', type=int, default=4)
    parser.add_argument('--action_size', type=int, default=2)
    # rollout hyperparameters
    parser.add_argument('--num_episodes_per_run', type=int, default=100)
    # policy hyperparameters
    parser.add_argument('--policy_model', type=str, default='actor-critic', help='Policy options: actor-critic.')
    parser.add_argument('--model_embedding_size', type=int, default=64)
    parser.add_argument('--model_hidden_size', type=int, default=128)
    parser.add_argument('--model_dropout', type=float, default=0.1)
    parser.add_argument('--num_episodes_per_checkpoint', type=int, default=100)
    # memory hyperparameters
    parser.add_argument('--experience_replay', action='store_true', default=False)
    parser.add_argument('--memory_capacity', type=int, default=100000, help='typically equals num_episodes_per_run * step_limit')
    # training hyperparameters
    parser.add_argument('--training_episodes', type=int, default=2000, help='equals num_episodes_per_run * num_runs')
    # ppo hyperparameters
    parser.add_argument('--lr', type=float, default=1e-4)
    parser.add_argument('--batch_size', type=int, default=64)
    parser.add_argument('--clip_param', type=float, default=0.2)
    parser.add_argument('--ppo_epoch', type=int, default=4)
    parser.add_argument('--clip_grad', type=float, default=0.5)
    parser.add_argument('--value_loss_coef', type=float, default=1.)
    parser.add_argument('--entropy_coef', type=float, default=0.01)
    parser.add_argument('--use_clipped_value_loss', action='store_true')
    # evaluation inputs
    parser.add_argument('--log', default=None)
    parser.add_argument('--visualize', action='store_true')
    return parser.parse_args()

eval.py

Lines changed: 39 additions & 0 deletions
@@ -0,0 +1,39 @@
from os.path import join, exists

import gym
import torch

from arg_parse import arg_parse
from src.policy import Policy
from src.rollout import Rollout
from src.experience_memory import ExperienceMemory


def main():
    ##### Initialization Phase #####
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print(device)
    args = arg_parse()
    experience_memory = ExperienceMemory(args)
    env = gym.make('CartPole-v0')
    policy = Policy(args, device=device)
    rollout = Rollout(env, policy, experience_memory=experience_memory)
    if args.log is None:
        raise RuntimeError("The log for evaluation is None.")
    logdir = join('results', args.log)
    checkpoint_filepath = join(logdir, 'model_weights', str(args.training_episodes)+'.pt')
    if not exists(checkpoint_filepath):
        raise RuntimeError(checkpoint_filepath+" does not exist.")
    checkpoint = torch.load(checkpoint_filepath, map_location=device)
    policy.load_model_weights(checkpoint)
    print("model weights are loaded.")
    if args.visualize:
        rollout_mode = 'visualize'
    else:
        rollout_mode = 'test'
    _, _, _, accumulative_reward = rollout.run(args, rollout_mode=rollout_mode)
    print("accumulative reward: ", accumulative_reward)
    env.close()


if __name__ == "__main__":
    main()

log.md

Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,8 @@
# Log

## 221023

- [Todo] Improve the code in `src/experience_memory.py`:
```
ppo-pytorch/src/experience_memory.py:82: UserWarning: Creating a tensor from a list of numpy.ndarrays is extremely slow. Please consider converting the list to a single numpy.ndarray with numpy.array() before converting to a tensor. (Triggered internally at ../torch/csrc/utils/tensor_new.cpp:201.)
```
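
A minimal sketch of one way to address that warning (not part of this commit; `states` below is a fabricated stand-in for one of the memory lists):
```
import numpy as np
import torch

# A list of per-step observations, each a numpy.ndarray of shape (state_size,).
states = [np.random.rand(4).astype(np.float32) for _ in range(8)]

# torch.Tensor(list_of_ndarrays) triggers the warning above.
# Stacking into a single ndarray first converts everything in one step and avoids it.
states_tensor = torch.as_tensor(np.asarray(states), dtype=torch.float32)  # shape (8, 4)
```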

requirements.txt

Lines changed: 37 additions & 0 deletions
@@ -0,0 +1,37 @@
absl-py==1.3.0
cachetools==4.2.4
certifi==2022.9.24
charset-normalizer==2.0.12
cloudpickle==1.6.0
dataclasses==0.8
future==0.18.2
google-auth==2.13.0
google-auth-oauthlib==0.4.6
grpcio==1.48.2
gym==0.18.0
idna==3.4
importlib-metadata==4.8.3
importlib-resources==5.4.0
Markdown==3.3.7
numpy==1.19.5
oauthlib==3.2.2
Pillow==7.2.0
protobuf==3.19.6
pyasn1==0.4.8
pyasn1-modules==0.2.8
pyglet==1.5.0
requests==2.27.1
requests-oauthlib==1.3.1
rsa==4.9
scipy==1.5.4
six==1.16.0
tdqm==0.0.1
tensorboard==2.10.1
tensorboard-data-server==0.6.1
tensorboard-plugin-wit==1.8.1
torch==1.10.2
tqdm==4.64.1
typing-extensions==4.1.1
urllib3==1.26.12
Werkzeug==2.0.3
zipp==3.6.0

src/experience_memory.py

Lines changed: 100 additions & 0 deletions
@@ -0,0 +1,100 @@
import torch
from torch.utils.data import Dataset


class ExperienceMemory(Dataset):
    """
    reference: https://github.com/vita-epfl/CrowdNav/blob/503173b836d5460e30234df7e14a7c67ee0ebfc7/crowd_nav/utils/memory.py#L4
    ExperienceMemory can be used to store rollouts for policy updates.
    """
    def __init__(self, args):
        self.args = args
        self.capacity = self.args.memory_capacity
        self.reset()


    def __len__(self):
        for k in self.memory.keys():
            assert self.length == len(self.memory[k])
        return self.length


    def push(self, experience):
        """Push the newest experience in and the oldest out. Used in the experience replay setting."""
        assert self.memory.keys() == experience.keys()
        experience_length = len(list(experience.values())[0])
        for i in range(experience_length):
            if self.length < self.position + 1:  # before capacity is filled.
                self.length += 1
                for k in self.memory.keys():
                    self.memory[k].append(experience[k][i])
            else:  # after capacity is filled, replace old experience with newer ones.
                for k in self.memory.keys():
                    self.memory[k][self.position] = experience[k][i]
            self.position = (self.position + 1) % self.capacity


    def concatenate(self, experience):
        """Concatenate with new experience (a dict of lists). Used when we do not do experience replay.
        The concatenated memory must not exceed the capacity."""
        assert self.memory.keys() == experience.keys()
        experience_length = len(list(experience.values())[0])
        if self.length + experience_length > self.capacity:
            raise RuntimeError("Adding new experience exceeds the memory capacity.")
        for k in self.memory.keys():
            self.memory[k] = self.memory[k] + experience[k]
        self.length += experience_length
        self.position += experience_length  # keep self.position in sync with self.length while below capacity.

    def is_full(self):
        return self.length == self.capacity


    def reset(self):
        self.memory = {}
        self.position = 0
        self.length = 0
        if self.args.policy_model == 'actor-critic':
            self.memory_keys = ['states', 'actions', 'rewards', 'values', 'returns', \
                'advantages', 'action_log_probs']  # ordered keys
            for k in self.memory_keys:
                self.memory[k] = []
        else:
            raise RuntimeError("Policy model is not supported.")
        self.update_tensor()

    def update_tensor(self):
        """
        list2tensor.
        Called in reset() to create empty tensors, or
        called after self.memory (a dict of lists) is updated from rollouts.
        outputs:
            - memory_tensor: dict of tensors.
                - ['states']: (data_len, state_size)
                - ['actions']: (data_len,)
                - ['rewards']: (data_len,)
                - ['values']: (data_len,)
                - ['returns']: (data_len,)
                - ['advantages']: (data_len,)
                - ['action_log_probs']: (data_len,)
        """
        self.memory_tensor = {}
        for k in self.memory.keys():
            self.memory_tensor[k] = torch.Tensor(self.memory[k])  # float32


    def __getitem__(self, index):
        """
        if self.args.policy_model == 'actor-critic':
            return a list in the following order:
            - states[index]
            - actions[index]
            - rewards[index]
            - values[index]
            - returns[index]
            - advantages[index]
            - action_log_probs[index]  # none have been moved .to(device) yet
        """
        out = []
        for k in self.memory_keys:  # ordered keys
            out.append(self.memory_tensor[k][index])
        return out
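
A minimal usage sketch for the class above (the `args` namespace and the experience values are fabricated placeholders; in the repo the experience dict would come from `Rollout` and the args from `arg_parse()`):
```
from types import SimpleNamespace

from torch.utils.data import DataLoader

from src.experience_memory import ExperienceMemory

# Hypothetical minimal args; the real ones come from arg_parse().
args = SimpleNamespace(memory_capacity=1000, policy_model='actor-critic')
memory = ExperienceMemory(args)

# A fake 3-step trajectory with the expected keys.
experience = {
    'states': [[0.0, 0.0, 0.0, 0.0]] * 3,
    'actions': [0, 1, 0],
    'rewards': [1.0, 1.0, 1.0],
    'values': [0.5, 0.4, 0.3],
    'returns': [2.9, 1.9, 1.0],
    'advantages': [0.1, -0.2, 0.3],
    'action_log_probs': [-0.7, -0.6, -0.7],
}
memory.concatenate(experience)  # no experience replay: just append
memory.update_tensor()          # refresh the tensors served by __getitem__

# Mini-batches for the PPO epochs.
loader = DataLoader(memory, batch_size=2, shuffle=True)
for states, actions, rewards, values, returns, advantages, log_probs in loader:
    pass  # feed each mini-batch to the PPO update
```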

src/model/actor_critic.py

Lines changed: 44 additions & 0 deletions
@@ -0,0 +1,44 @@
import torch.nn as nn
import torch.nn.functional as F

from src.model.utils import make_mlp


class ActorCritic(nn.Module):
    """Actor Critic."""
    def __init__(self, state_size, action_size, \
        embedding_size=64, hidden_size=128, dropout=0.1):
        super(ActorCritic, self).__init__()
        # spatial embeddings
        self.actor_embedding = make_mlp([state_size, hidden_size, embedding_size], \
            batchnorm=True, activation='relu', dropout=dropout)
        self.critic_embedding = make_mlp([state_size, hidden_size, embedding_size], \
            batchnorm=True, activation='relu', dropout=dropout)
        # encoders
        self.actor_encoder = make_mlp([embedding_size, hidden_size], \
            batchnorm=True, activation='relu', dropout=dropout)
        self.critic_encoder = make_mlp([embedding_size, hidden_size], \
            batchnorm=True, activation='relu', dropout=dropout)
        self.actor_fc = nn.Linear(hidden_size, action_size)
        self.critic_fc = nn.Linear(hidden_size, 1)


    def forward(self, x):
        """
        inputs:
            - x
                # input data.
                # tensor. size: (batch_size, state_size)
        outputs:
            - action_prob
                # probability of actions in the stochastic policy.
                # tensor. size: (batch_size, action_size)
            - state_value
                # value of each state from the critic.
                # tensor. size: (batch_size, )
        """
        xa = self.actor_embedding(x)
        action_prob = F.softmax(self.actor_fc(self.actor_encoder(xa)), dim=-1)  # (batch_size, action_size)
        xc = self.critic_embedding(x)
        state_value = self.critic_fc(self.critic_encoder(xc)).squeeze(-1)  # (batch_size, )
        return action_prob, state_value
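
A small sketch of how a stochastic policy could act with this module; this is not the repo's `Policy` class (which is not among the files shown here):
```
import torch
from torch.distributions import Categorical

from src.model.actor_critic import ActorCritic

model = ActorCritic(state_size=4, action_size=2)
model.eval()  # BatchNorm1d layers need eval mode (or a batch size > 1)

state = torch.zeros(1, 4)                    # (batch_size, state_size)
with torch.no_grad():
    action_prob, state_value = model(state)  # (1, 2), (1,)

dist = Categorical(probs=action_prob)
action = dist.sample()                       # (1,)
log_prob = dist.log_prob(action)             # (1,), later used in the PPO ratio
```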

src/model/utils.py

Lines changed: 15 additions & 0 deletions
@@ -0,0 +1,15 @@
import torch.nn as nn

def make_mlp(dim_list, batchnorm=False, activation='relu', dropout=0.):
    layers = []
    for dim_in, dim_out in zip(dim_list[:-1], dim_list[1:]):
        layers.append(nn.Linear(dim_in, dim_out))
        if batchnorm:
            layers.append(nn.BatchNorm1d(dim_out))
        if activation == 'relu':
            layers.append(nn.ReLU())
        elif activation == 'leakyrelu':
            layers.append(nn.LeakyReLU())
        if dropout > 0.:
            layers.append(nn.Dropout(p=dropout))
    return nn.Sequential(*layers)
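
For reference, a small usage sketch of `make_mlp` (the sizes are arbitrary):
```
import torch

from src.model.utils import make_mlp

# Builds Linear(4, 128) and Linear(128, 64), each followed by BatchNorm1d, ReLU, and Dropout.
mlp = make_mlp([4, 128, 64], batchnorm=True, activation='relu', dropout=0.1)

x = torch.randn(8, 4)  # a batch of 8 CartPole states
out = mlp(x)           # shape (8, 64)
```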
