#!/usr/bin/env python
import gym
import numpy as np
import tensorflow as tf
import tensorflow.keras.layers as klayers
import tensorflow.keras.losses as klosses
import tensorflow.keras.optimizers as koptimizers
import matplotlib.pyplot as plt

from common.multiprocessing_env import SubprocVecEnv

###############################################
# Create the Pendulum-v1 environment with gym
###############################################
num_envs = 8
env_name = "Pendulum-v1"


def make_env():
    def _thunk():
        env = gym.make(env_name)
        return env
    return _thunk


#######################################################
# Define a custom layer for a normal (Gaussian) action distribution
#######################################################
class ProbabilityDistribution(tf.keras.layers.Layer):
    def __init__(self, name):
        super(ProbabilityDistribution, self).__init__(name=name)
        # A single trainable log standard deviation shared across the batch
        self.log_std = self.add_weight(name='std', shape=(1, 1), trainable=True)

    def call(self, inputs):
        action_mean = inputs[0]
        action = inputs[1]
        std = tf.math.exp(self.log_std)
        # Log probability of the action under N(action_mean, std)
        log_prob = -tf.square(tf.subtract(action, action_mean)) / (2 * std ** 2) \
                   - tf.math.log(std) - 0.5 * np.log(2 * np.pi)
        # Entropy of the normal distribution: 0.5 + log(sqrt(2 * pi) * std)
        k = np.sqrt(2 * np.pi)
        entropy = 0.5 + tf.math.log(k * std)
        ones_ = tf.ones_like(log_prob)
        return log_prob, entropy * ones_

    def get_std(self):
        # Note: returns the *log* standard deviation; callers apply tf.math.exp
        return self.log_std


##########################################################################
# Define an actor-critic architecture containing policy and value networks,
# a sampling operation from a normal distribution, and the surrogate
# loss for the PPO algorithm
##########################################################################
class ActorCritic:
    def __init__(self, ob_shape, ac_shape, hidden_size, eps, ent_coeff, name):
        self.hidden_size = hidden_size
        self.eps = eps
        self.ent_coeff = ent_coeff

        # define inputs
        ob_input = klayers.Input(shape=ob_shape, name='ob_input')
        ac_input = klayers.Input(shape=ac_shape, name='ac_input')
        logpi_old_input = klayers.Input(shape=ac_shape, name='logpi_old_input')
        advs_input = klayers.Input(shape=(1,), name='advs_input')

        # define custom layers
        self.dist = ProbabilityDistribution(name='normal_distribution')
        self.comp_loss = klayers.Lambda(self.comp_actor_loss)
        self.random_normal = klayers.Lambda(self.comp_random_normal)

        # define the base model: a value head, a policy (action-mean) head and a sampled action
        v_h1 = klayers.Dense(self.hidden_size, activation='relu')(ob_input)
        value = klayers.Dense(1, activation=None)(v_h1)
        a_h1 = klayers.Dense(self.hidden_size, activation='relu')(ob_input)
        a_out = klayers.Dense(1, activation=None)(a_h1)
        action_sample = self.random_normal(a_out)
        self.model_base = tf.keras.Model(inputs=ob_input,
                                         outputs=[a_out, action_sample, value],
                                         name=name + '_base')

        # define the probability model
        [a_out, _, value_out] = self.model_base(ob_input)
        log_prob, entropy = self.dist([a_out, ac_input])
        self.model_prob = tf.keras.Model(inputs=[ob_input, ac_input],
                                         outputs=[log_prob, entropy, value_out],
                                         name=name + '_prob')

        # define the final model used for training
        ac_input_ = klayers.Input(shape=ac_shape, name='ac_input_')
        ob_input_ = klayers.Input(shape=ob_shape, name='ob_input_')
        [logpi, entropy_, value_] = self.model_prob([ob_input_, ac_input_])
        actor_loss_ = self.comp_loss([logpi, logpi_old_input, advs_input, entropy_])
        self.model_final = tf.keras.Model(inputs=[ob_input_, ac_input_, logpi_old_input, advs_input],
                                          outputs=[actor_loss_, value_],
                                          name=name + '_final')
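
    # The PPO clipped surrogate objective (Schulman et al., 2017) that comp_actor_loss
    # implements below:
    #     L_clip = E[ min(r_t * A_t, clip(r_t, 1 - eps, 1 + eps) * A_t) ],
    #     r_t = pi(a_t | s_t) / pi_old(a_t | s_t) = exp(log pi - log pi_old)
    # The method returns the negative of this objective minus an entropy bonus weighted by
    # ent_coeff, so that minimizing the Keras loss maximizes the clipped objective.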
    def comp_actor_loss(self, tensor):
        logpi = tensor[0]
        logpi_old_input = tensor[1]
        advs_input = tensor[2]
        entropy_ = tensor[3]
        ratio = tf.exp(logpi - logpi_old_input)
        surr = ratio * advs_input
        actor_loss = tf.minimum(surr,
                                tf.clip_by_value(ratio, 1 - self.eps, 1 + self.eps) * advs_input)
        actor_loss_ = -actor_loss - self.ent_coeff * tf.reduce_mean(entropy_)
        return actor_loss_

    def comp_random_normal(self, a_out):
        # Sample one action per row of a_out; the noise shape follows a_out so that each
        # parallel environment gets its own noise (the original shape=(1, 1) would broadcast
        # a single noise value across the whole batch).
        action_sample = tf.random.normal(shape=tf.shape(a_out), mean=a_out,
                                         stddev=tf.math.exp(self.dist.get_std()))
        return action_sample

    def get_models(self):
        return (self.model_base, self.model_prob, self.model_final)


########################################################################
# Implements generalized advantage estimation (GAE),
# a lower-variance estimate of the advantage
########################################################################
def compute_gae(next_value, rewards, masks, values, gamma=0.99, tau=0.95):
    values = values + [next_value]
    gae = 0
    returns = []
    for step in reversed(range(len(rewards))):
        delta = rewards[step] + gamma * values[step + 1] * masks[step] - values[step]
        gae = delta + gamma * tau * masks[step] * gae
        returns.insert(0, gae + values[step])
    return returns


##################################################################################
# Proximal Policy Optimization algorithm, on top of the actor-critic architecture
##################################################################################
class PPO:
    def __init__(self, ob_shape, ac_shape, lr, hidden_size, eps=0.2, v_coeff=0.5, ent_coeff=0.01):
        self.ob_shape = ob_shape
        self.ac_shape = ac_shape
        self.lr = lr
        self.hidden_size = hidden_size
        self.eps = eps
        self.v_coeff = v_coeff
        self.ent_coeff = ent_coeff

        # current actor-critic
        actor_critic = ActorCritic(self.ob_shape, self.ac_shape, self.hidden_size,
                                   self.eps, self.ent_coeff, name='pi')
        self.pi_base, self.pi_prob, self.pi_final = actor_critic.get_models()

        # old actor-critic; it is never trained, only synchronized via assign_old_pi()
        actor_critic_old = ActorCritic(self.ob_shape, self.ac_shape, self.hidden_size,
                                       self.eps, self.ent_coeff, name='pi_old')
        self.pi_base_old, self.pi_prob_old, _ = actor_critic_old.get_models()

        self.pi_base.summary()
        self.pi_prob.summary()
        self.pi_final.summary()

        # compile the model with separate losses for the actor-loss output and the value estimate
        self.pi_final.compile(
            optimizer=koptimizers.Adam(learning_rate=self.lr),
            loss=[self._action_loss, self._value_loss]
        )

    def _action_loss(self, true_val, pred_val):
        # The surrogate loss is already computed inside the graph; just average it over the batch
        return tf.reduce_mean(pred_val)

    def _value_loss(self, returns, value):
        return self.v_coeff * klosses.mean_squared_error(returns, value)

    def get_action(self, obs):
        _, action_sample, _ = self.pi_base.predict(obs)
        return action_sample

    def get_value(self, obs):
        _, _, value = self.pi_base.predict(obs)
        return value

    def assign_old_pi(self):
        self.pi_prob_old.set_weights(self.pi_prob.get_weights())

    # Train the PPO actor-critic network with a single mini-batch
    def update(self, obs, acs, returns, advs):
        old_pi_prob, _, _ = self.pi_prob_old.predict([obs, acs])
        self.pi_final.train_on_batch(x=[obs, acs, old_pi_prob, advs],
                                     y=[np.ones_like(returns), returns])


############################################################
# Generates random mini-batches for the PPO update epochs
############################################################
def ppo_iter(mini_batch_size, obs, acs, returns, advantage):
    batch_size = obs.shape[0]
    for _ in range(batch_size // mini_batch_size):
        rand_ids = np.random.randint(0, batch_size, mini_batch_size)
        yield (obs[rand_ids, :], acs[rand_ids, :],
               returns[rand_ids, :], advantage[rand_ids, :])
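
# A minimal usage sketch of compute_gae and ppo_iter (illustrative only; the shapes mirror
# those produced by the training loop below):
#
#     rewards = [np.ones((num_envs, 1))] * 3     # 3 rollout steps of GAIL rewards
#     masks   = [np.ones((num_envs, 1))] * 3     # 1.0 while the episode is still running
#     values  = [np.zeros((num_envs, 1))] * 3    # critic estimates for each step
#     returns = compute_gae(np.zeros((num_envs, 1)), rewards, masks, values)
#     # len(returns) == 3, each element of shape (num_envs, 1); concatenating them and
#     # subtracting the values gives the advantages that ppo_iter splits into mini-batches.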
###############################################################
# Test-run the model from a random starting state and return the
# total reward accumulated over one episode
###############################################################
def test_env(model, vis=False):
    ob = env.reset()
    done = False
    total_reward = 0
    while not done:
        if vis:
            env.render()
        ac = model.get_action(np.expand_dims(ob, axis=0))
        next_ob, reward, done, _ = env.step(ac)
        ob = np.squeeze(next_ob)
        total_reward += reward
    return total_reward


#######################################################
# Load the expert trajectories.
# Expert trajectories are the state-action pairs recorded while an expert
# (here, a pre-trained PPO agent) performs the control task.
# In this task the state is a 3-dimensional vector (cos_theta, sin_theta, theta_dot)
# while the action is a scalar (the applied torque).
#######################################################
try:
    expert_traj = np.load("expert_traj.npy")
except OSError:
    print("Train, generate and save expert trajectories using the PPO algorithm first")
    assert False


###############################################################
# Implements the GAIL discriminator
###############################################################
class Discriminator:
    def __init__(self, ob_shape, ac_shape, hidden_size, batch_size, lr, name='discriminator'):
        self.ob_shape = ob_shape
        self.ac_shape = ac_shape
        self.hidden_size = hidden_size
        self.batch_size = batch_size
        self.lr = lr
        self.name = name

        self.ones = tf.ones([self.batch_size, 1])
        self.zeros = tf.zeros([self.batch_size, 1])
        self.zeros_ones = tf.concat([self.zeros, self.ones], axis=0)
        self.ones_zeros = tf.concat([self.ones, self.zeros], axis=0)

        # Combined observation-action input: the first ob_shape[0] elements of each row are
        # the observation vector and the remaining ac_shape[0] elements are the action vector.
        ob_ac = klayers.Input(shape=[ob_shape[0] + ac_shape[0]])

        # Base part of the discriminator network
        d_h1 = klayers.Dense(self.hidden_size, activation='tanh', name='dense1')(ob_ac)
        d_h2 = klayers.Dense(self.hidden_size, activation='tanh', name='dense2')(d_h1)
        d_out = klayers.Dense(1, activation=None, name='dense3')(d_h2)

        # The discriminator provides the policy's reward as -log(sigmoid(d_out)), i.e. the
        # negative log of the probability it assigns to the (ob, ac) pair being a policy sample.
        reward = -tf.squeeze(tf.math.log(tf.sigmoid(d_out)))

        # Model that produces the reward from observation/action pairs
        self.model_reward = tf.keras.Model(inputs=ob_ac, outputs=reward)

        # Model used to train the discriminator on its single logit output
        self.model_prob = tf.keras.Model(inputs=ob_ac, outputs=d_out)
        self.model_prob.compile(optimizer=koptimizers.Adam(learning_rate=self.lr),
                                loss=klosses.BinaryCrossentropy(from_logits=True))

    def get_reward(self, _ob_ac):
        reward = self.model_reward.predict(_ob_ac)
        return reward

    def update(self, all_ob_ac):
        # The first batch_size rows are expert pairs (label 0) and the next batch_size rows
        # are policy pairs (label 1), matching the -log(sigmoid) reward above.
        self.model_prob.train_on_batch(all_ob_ac, self.zeros_ones)
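
# Label convention used by the discriminator above: expert (ob, ac) rows come first in the
# concatenated batch and receive label 0, policy rows come second and receive label 1, so the
# GAIL reward -log(sigmoid(d_out)) grows as the discriminator mistakes policy samples for
# expert ones. A minimal sketch of one discriminator step, assuming `disc` is a Discriminator
# built with batch_size == N and `expert_batch`, `policy_batch` are (N, 4) arrays:
#
#     disc.update(np.concatenate([expert_batch, policy_batch], axis=0))
#     gail_rewards = disc.get_reward(policy_batch)   # shape (N,), one reward per pair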
#######################################################
# GAIL training procedure starts from here.
# First define the training hyper-parameters and then
# run the training procedure itself.
#######################################################

############# Hyper-parameters ########################
ppo_hidden_size = 256
discriminator_hidden_size = 128
lr = 3e-4
num_steps = 20
mini_batch_size = 5
ppo_epochs = 4
threshold_reward = -200

max_frames = 20000
frame_idx = 0
plot_interval = 1000  # set this to a low value such as 10 to quickly verify that the plot updates
test_rewards = []

if __name__ == '__main__':
    envs = [make_env() for i in range(num_envs)]
    envs = SubprocVecEnv(envs)
    env = gym.make(env_name)

    ############# Training procedure ###########################
    ob_shape = list(envs.observation_space.shape)
    ac_shape = list(envs.action_space.shape)

    ob = envs.reset()
    early_stop = False

    config = tf.compat.v1.ConfigProto()
    config.gpu_options.allow_growth = True
    tf.compat.v1.keras.backend.set_session(tf.compat.v1.Session(config=config))

    ppo = PPO(ob_shape, ac_shape, lr, ppo_hidden_size)
    discriminator = Discriminator(ob_shape, ac_shape, discriminator_hidden_size,
                                  num_envs * num_steps, lr)

    i_update = 0

    fig, ax = plt.subplots(1, 1)
    ax.set_xlabel('frame_index/plot_interval')
    ax.set_ylabel('reward')

    while frame_idx < max_frames and not early_stop:
        i_update += 1

        values = []
        obs = []
        acs = []
        rewards = []
        masks = []
        entropy = 0

        # Collect num_steps transitions from the vectorized environments
        for _ in range(num_steps):
            ac = ppo.get_action(ob)
            next_ob, _, done, _ = envs.step(ac)
            # The environment reward is discarded; the discriminator provides the GAIL reward
            reward = discriminator.get_reward(np.concatenate([ob, ac], axis=1))
            value = ppo.get_value(ob)

            values.append(value)
            rewards.append(reward[:, np.newaxis])
            masks.append((1 - done)[:, np.newaxis])

            obs.append(ob)
            acs.append(ac)
            ob = next_ob
            frame_idx += 1

            if frame_idx % plot_interval == 0:
                test_reward = np.mean([test_env(ppo) for _ in range(10)])
                test_rewards.append(test_reward)
                ax.plot(range(int(frame_idx / plot_interval)), test_rewards)
                fig.canvas.draw()
                fig.canvas.flush_events()
                plt.pause(0.1)
                if test_reward > threshold_reward:
                    early_stop = True

        next_value = ppo.get_value(next_ob)
        returns = compute_gae(next_value, rewards, masks, values)

        returns = np.concatenate(returns)
        values = np.concatenate(values)
        obs = np.concatenate(obs)
        acs = np.concatenate(acs)
        advantages = returns - values

        # Policy update (every third iteration)
        if i_update % 3 == 0:
            ppo.assign_old_pi()
            for _ in range(ppo_epochs):
                for ob_batch, ac_batch, return_batch, adv_batch in ppo_iter(
                        mini_batch_size, obs, acs, returns, advantages):
                    ppo.update(ob_batch, ac_batch, return_batch, adv_batch)

        # Discriminator update
        expert_ob_ac = expert_traj[np.random.randint(0, expert_traj.shape[0],
                                                     num_steps * num_envs), :]
        policy_ob_ac = np.concatenate([obs, acs], 1)
        discriminator.update(np.concatenate([expert_ob_ac, policy_ob_ac], axis=0))

    print(test_rewards)
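
# Optional follow-up (a sketch, not part of the original script): "expert_traj.npy" is expected
# to be a (N, ob_dim + ac_dim) array, i.e. (N, 4) for Pendulum-v1, with each row an observation
# concatenated with the expert's action. After training, the reward curve and policy weights
# could be saved with, for example (file names are hypothetical):
#
#     np.save("gail_test_rewards.npy", np.array(test_rewards))
#     ppo.pi_base.save_weights("gail_pendulum_pi_base.h5")
#     plt.show()   # keep the final reward plot on screen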