#!/usr/bin/env python
import gym
import numpy as np
import tensorflow as tf
import tensorflow.keras.layers as klayers
import tensorflow.keras.losses as klosses
import tensorflow.keras.optimizers as koptimizers
import matplotlib.pyplot as plt

from common.multiprocessing_env import SubprocVecEnv

###############################################
# Create the Pendulum-v1 environment with gym
###############################################
num_envs = 8
env_name = "Pendulum-v1"


def make_env():
    def _thunk():
        env = gym.make(env_name)
        return env
    return _thunk


#######################################################
# Define a custom layer for a normal (Gaussian) action distribution
#######################################################
class ProbabilityDistribution(tf.keras.layers.Layer):
    def __init__(self, name):
        super(ProbabilityDistribution, self).__init__(name=name)
        # A single trainable log standard deviation shared across the batch
        self.log_std = self.add_weight(name='std', shape=(1, 1), trainable=True)

    def call(self, inputs):
        action_mean = inputs[0]
        action = inputs[1]
        std = tf.math.exp(self.log_std)
        # Log probability of the action under N(action_mean, std)
        log_prob = -tf.square(tf.subtract(action, action_mean)) / (2 * std ** 2) \
                   - tf.math.log(std) - 0.5 * np.log(2 * np.pi)
        # Entropy of the normal distribution: 0.5 + log(sqrt(2 * pi) * std)
        k = np.sqrt(2 * np.pi)
        entropy = 0.5 + tf.math.log(k * std)
        ones_ = tf.ones_like(log_prob)
        return log_prob, entropy * ones_

    def get_std(self):
        # Note: returns the *log* standard deviation; callers apply tf.math.exp
        return self.log_std


##########################################################################
# Define an actor-critic architecture containing policy and value networks,
# a sampling operation from a normal distribution, and the surrogate
# loss for the PPO algorithm
##########################################################################
class ActorCritic:
    def __init__(self, ob_shape, ac_shape, hidden_size, eps, ent_coeff, name):
        self.hidden_size = hidden_size
        self.eps = eps
        self.ent_coeff = ent_coeff

        # define inputs
        ob_input = klayers.Input(shape=ob_shape, name='ob_input')
        ac_input = klayers.Input(shape=ac_shape, name='ac_input')
        logpi_old_input = klayers.Input(shape=ac_shape, name='logpi_old_input')
        advs_input = klayers.Input(shape=(1,), name='advs_input')

        # define custom layers
        self.dist = ProbabilityDistribution(name='normal_distribution')
        self.comp_loss = klayers.Lambda(self.comp_actor_loss)
        self.random_normal = klayers.Lambda(self.comp_random_normal)

        # define the base model: a value head, a policy (action-mean) head and a sampled action
        v_h1 = klayers.Dense(self.hidden_size, activation='relu')(ob_input)
        value = klayers.Dense(1, activation=None)(v_h1)
        a_h1 = klayers.Dense(self.hidden_size, activation='relu')(ob_input)
        a_out = klayers.Dense(1, activation=None)(a_h1)
        action_sample = self.random_normal(a_out)
        self.model_base = tf.keras.Model(inputs=ob_input,
                                         outputs=[a_out, action_sample, value],
                                         name=name + '_base')

        # define the probability model
        [a_out, _, value_out] = self.model_base(ob_input)
        log_prob, entropy = self.dist([a_out, ac_input])
        self.model_prob = tf.keras.Model(inputs=[ob_input, ac_input],
                                         outputs=[log_prob, entropy, value_out],
                                         name=name + '_prob')

        # define the final model used for training
        ac_input_ = klayers.Input(shape=ac_shape, name='ac_input_')
        ob_input_ = klayers.Input(shape=ob_shape, name='ob_input_')
        [logpi, entropy_, value_] = self.model_prob([ob_input_, ac_input_])
        actor_loss_ = self.comp_loss([logpi, logpi_old_input, advs_input, entropy_])
        self.model_final = tf.keras.Model(inputs=[ob_input_, ac_input_, logpi_old_input, advs_input],
                                          outputs=[actor_loss_, value_],
                                          name=name + '_final')
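
    # The PPO clipped surrogate objective (Schulman et al., 2017) that comp_actor_loss
    # implements below:
    #     L_clip = E[ min(r_t * A_t, clip(r_t, 1 - eps, 1 + eps) * A_t) ],
    #     r_t = pi(a_t | s_t) / pi_old(a_t | s_t) = exp(log pi - log pi_old)
    # The method returns the negative of this objective minus an entropy bonus weighted by
    # ent_coeff, so that minimizing the Keras loss maximizes the clipped objective.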
    def comp_actor_loss(self, tensor):
        logpi = tensor[0]
        logpi_old_input = tensor[1]
        advs_input = tensor[2]
        entropy_ = tensor[3]
        ratio = tf.exp(logpi - logpi_old_input)
        surr = ratio * advs_input
        actor_loss = tf.minimum(surr,
                                tf.clip_by_value(ratio, 1 - self.eps, 1 + self.eps) * advs_input)
        actor_loss_ = -actor_loss - self.ent_coeff * tf.reduce_mean(entropy_)
        return actor_loss_

    def comp_random_normal(self, a_out):
        # Sample one action per row of a_out; the noise shape follows a_out so that each
        # parallel environment gets its own noise (the original shape=(1, 1) would broadcast
        # a single noise value across the whole batch).
        action_sample = tf.random.normal(shape=tf.shape(a_out), mean=a_out,
                                         stddev=tf.math.exp(self.dist.get_std()))
        return action_sample

    def get_models(self):
        return (self.model_base, self.model_prob, self.model_final)


########################################################################
# Implements generalized advantage estimation (GAE),
# a lower-variance estimate of the advantage
########################################################################
def compute_gae(next_value, rewards, masks, values, gamma=0.99, tau=0.95):
    values = values + [next_value]
    gae = 0
    returns = []
    for step in reversed(range(len(rewards))):
        delta = rewards[step] + gamma * values[step + 1] * masks[step] - values[step]
        gae = delta + gamma * tau * masks[step] * gae
        returns.insert(0, gae + values[step])
    return returns


##################################################################################
# Proximal Policy Optimization algorithm, on top of the actor-critic architecture
##################################################################################
class PPO:
    def __init__(self, ob_shape, ac_shape, lr, hidden_size, eps=0.2, v_coeff=0.5, ent_coeff=0.01):
        self.ob_shape = ob_shape
        self.ac_shape = ac_shape
        self.lr = lr
        self.hidden_size = hidden_size
        self.eps = eps
        self.v_coeff = v_coeff
        self.ent_coeff = ent_coeff

        # current actor-critic
        actor_critic = ActorCritic(self.ob_shape, self.ac_shape, self.hidden_size,
                                   self.eps, self.ent_coeff, name='pi')
        self.pi_base, self.pi_prob, self.pi_final = actor_critic.get_models()

        # old actor-critic; it is never trained, only synchronized via assign_old_pi()
        actor_critic_old = ActorCritic(self.ob_shape, self.ac_shape, self.hidden_size,
                                       self.eps, self.ent_coeff, name='pi_old')
        self.pi_base_old, self.pi_prob_old, _ = actor_critic_old.get_models()

        self.pi_base.summary()
        self.pi_prob.summary()
        self.pi_final.summary()

        # compile the model with separate losses for the actor-loss output and the value estimate
        self.pi_final.compile(
            optimizer=koptimizers.Adam(learning_rate=self.lr),
            loss=[self._action_loss, self._value_loss]
        )

    def _action_loss(self, true_val, pred_val):
        # The surrogate loss is already computed inside the graph; just average it over the batch
        return tf.reduce_mean(pred_val)

    def _value_loss(self, returns, value):
        return self.v_coeff * klosses.mean_squared_error(returns, value)

    def get_action(self, obs):
        _, action_sample, _ = self.pi_base.predict(obs)
        return action_sample

    def get_value(self, obs):
        _, _, value = self.pi_base.predict(obs)
        return value

    def assign_old_pi(self):
        self.pi_prob_old.set_weights(self.pi_prob.get_weights())

    # Train the PPO actor-critic network with a single mini-batch
    def update(self, obs, acs, returns, advs):
        old_pi_prob, _, _ = self.pi_prob_old.predict([obs, acs])
        self.pi_final.train_on_batch(x=[obs, acs, old_pi_prob, advs],
                                     y=[np.ones_like(returns), returns])


############################################################
# Generates random mini-batches for the PPO update epochs
############################################################
def ppo_iter(mini_batch_size, obs, acs, returns, advantage):
    batch_size = obs.shape[0]
    for _ in range(batch_size // mini_batch_size):
        rand_ids = np.random.randint(0, batch_size, mini_batch_size)
        yield (obs[rand_ids, :], acs[rand_ids, :],
               returns[rand_ids, :], advantage[rand_ids, :])
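
# A minimal usage sketch of compute_gae and ppo_iter (illustrative only; the shapes mirror
# those produced by the training loop below):
#
#     rewards = [np.ones((num_envs, 1))] * 3     # 3 rollout steps of GAIL rewards
#     masks   = [np.ones((num_envs, 1))] * 3     # 1.0 while the episode is still running
#     values  = [np.zeros((num_envs, 1))] * 3    # critic estimates for each step
#     returns = compute_gae(np.zeros((num_envs, 1)), rewards, masks, values)
#     # len(returns) == 3, each element of shape (num_envs, 1); concatenating them and
#     # subtracting the values gives the advantages that ppo_iter splits into mini-batches.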
###############################################################
# Test-run the model from a random starting state and return the
# total reward accumulated over one episode
###############################################################
def test_env(model, vis=False):
    ob = env.reset()
    done = False
    total_reward = 0
    while not done:
        if vis:
            env.render()
        ac = model.get_action(np.expand_dims(ob, axis=0))
        next_ob, reward, done, _ = env.step(ac)
        ob = np.squeeze(next_ob)
        total_reward += reward
    return total_reward


#######################################################
# Load the expert trajectories.
# Expert trajectories are the state-action pairs recorded while an expert
# (here, a pre-trained PPO agent) performs the control task.
# In this task the state is a 3-dimensional vector (cos_theta, sin_theta, theta_dot)
# while the action is a scalar (the applied torque).
#######################################################
try:
    expert_traj = np.load("expert_traj.npy")
except OSError:
    print("Train, generate and save expert trajectories using the PPO algorithm first")
    assert False


###############################################################
# Implements the GAIL discriminator
###############################################################
class Discriminator:
    def __init__(self, ob_shape, ac_shape, hidden_size, batch_size, lr, name='discriminator'):
        self.ob_shape = ob_shape
        self.ac_shape = ac_shape
        self.hidden_size = hidden_size
        self.batch_size = batch_size
        self.lr = lr
        self.name = name

        self.ones = tf.ones([self.batch_size, 1])
        self.zeros = tf.zeros([self.batch_size, 1])
        self.zeros_ones = tf.concat([self.zeros, self.ones], axis=0)
        self.ones_zeros = tf.concat([self.ones, self.zeros], axis=0)

        # Combined observation-action input: the first ob_shape[0] elements of each row are
        # the observation vector and the remaining ac_shape[0] elements are the action vector.
        ob_ac = klayers.Input(shape=[ob_shape[0] + ac_shape[0]])

        # Base part of the discriminator network
        d_h1 = klayers.Dense(self.hidden_size, activation='tanh', name='dense1')(ob_ac)
        d_h2 = klayers.Dense(self.hidden_size, activation='tanh', name='dense2')(d_h1)
        d_out = klayers.Dense(1, activation=None, name='dense3')(d_h2)

        # The discriminator provides the policy's reward as -log(sigmoid(d_out)), i.e. the
        # negative log of the probability it assigns to the (ob, ac) pair being a policy sample.
        reward = -tf.squeeze(tf.math.log(tf.sigmoid(d_out)))

        # Model that produces the reward from observation/action pairs
        self.model_reward = tf.keras.Model(inputs=ob_ac, outputs=reward)

        # Model used to train the discriminator on its single logit output
        self.model_prob = tf.keras.Model(inputs=ob_ac, outputs=d_out)
        self.model_prob.compile(optimizer=koptimizers.Adam(learning_rate=self.lr),
                                loss=klosses.BinaryCrossentropy(from_logits=True))

    def get_reward(self, _ob_ac):
        reward = self.model_reward.predict(_ob_ac)
        return reward

    def update(self, all_ob_ac):
        # The first batch_size rows are expert pairs (label 0) and the next batch_size rows
        # are policy pairs (label 1), matching the -log(sigmoid) reward above.
        self.model_prob.train_on_batch(all_ob_ac, self.zeros_ones)
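
# Label convention used by the discriminator above: expert (ob, ac) rows come first in the
# concatenated batch and receive label 0, policy rows come second and receive label 1, so the
# GAIL reward -log(sigmoid(d_out)) grows as the discriminator mistakes policy samples for
# expert ones. A minimal sketch of one discriminator step, assuming `disc` is a Discriminator
# built with batch_size == N and `expert_batch`, `policy_batch` are (N, 4) arrays:
#
#     disc.update(np.concatenate([expert_batch, policy_batch], axis=0))
#     gail_rewards = disc.get_reward(policy_batch)   # shape (N,), one reward per pair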
#######################################################
# GAIL training procedure starts from here.
# First define the training hyper-parameters and then
# run the training procedure itself.
#######################################################

############# Hyper-parameters ########################
ppo_hidden_size = 256
discriminator_hidden_size = 128
lr = 3e-4
num_steps = 20
mini_batch_size = 5
ppo_epochs = 4
threshold_reward = -200

max_frames = 20000
frame_idx = 0
plot_interval = 1000  # set this to a low value such as 10 to quickly verify that the plot updates
test_rewards = []

if __name__ == '__main__':
    envs = [make_env() for i in range(num_envs)]
    envs = SubprocVecEnv(envs)
    env = gym.make(env_name)

    ############# Training procedure ###########################
    ob_shape = list(envs.observation_space.shape)
    ac_shape = list(envs.action_space.shape)

    ob = envs.reset()
    early_stop = False

    config = tf.compat.v1.ConfigProto()
    config.gpu_options.allow_growth = True
    tf.compat.v1.keras.backend.set_session(tf.compat.v1.Session(config=config))

    ppo = PPO(ob_shape, ac_shape, lr, ppo_hidden_size)
    discriminator = Discriminator(ob_shape, ac_shape, discriminator_hidden_size,
                                  num_envs * num_steps, lr)

    i_update = 0

    fig, ax = plt.subplots(1, 1)
    ax.set_xlabel('frame_index/plot_interval')
    ax.set_ylabel('reward')

    while frame_idx < max_frames and not early_stop:
        i_update += 1

        values = []
        obs = []
        acs = []
        rewards = []
        masks = []
        entropy = 0

        # Collect num_steps transitions from the vectorized environments
        for _ in range(num_steps):
            ac = ppo.get_action(ob)
            next_ob, _, done, _ = envs.step(ac)
            # The environment reward is discarded; the discriminator provides the GAIL reward
            reward = discriminator.get_reward(np.concatenate([ob, ac], axis=1))
            value = ppo.get_value(ob)

            values.append(value)
            rewards.append(reward[:, np.newaxis])
            masks.append((1 - done)[:, np.newaxis])

            obs.append(ob)
            acs.append(ac)
            ob = next_ob
            frame_idx += 1

            if frame_idx % plot_interval == 0:
                test_reward = np.mean([test_env(ppo) for _ in range(10)])
                test_rewards.append(test_reward)
                ax.plot(range(int(frame_idx / plot_interval)), test_rewards)
                fig.canvas.draw()
                fig.canvas.flush_events()
                plt.pause(0.1)
                if test_reward > threshold_reward:
                    early_stop = True

        next_value = ppo.get_value(next_ob)
        returns = compute_gae(next_value, rewards, masks, values)

        returns = np.concatenate(returns)
        values = np.concatenate(values)
        obs = np.concatenate(obs)
        acs = np.concatenate(acs)
        advantages = returns - values

        # Policy update (every third iteration)
        if i_update % 3 == 0:
            ppo.assign_old_pi()
            for _ in range(ppo_epochs):
                for ob_batch, ac_batch, return_batch, adv_batch in ppo_iter(
                        mini_batch_size, obs, acs, returns, advantages):
                    ppo.update(ob_batch, ac_batch, return_batch, adv_batch)

        # Discriminator update
        expert_ob_ac = expert_traj[np.random.randint(0, expert_traj.shape[0],
                                                     num_steps * num_envs), :]
        policy_ob_ac = np.concatenate([obs, acs], 1)
        discriminator.update(np.concatenate([expert_ob_ac, policy_ob_ac], axis=0))

    print(test_rewards)
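
# Optional follow-up (a sketch, not part of the original script): "expert_traj.npy" is expected
# to be a (N, ob_dim + ac_dim) array, i.e. (N, 4) for Pendulum-v1, with each row an observation
# concatenated with the expert's action. After training, the reward curve and policy weights
# could be saved with, for example (file names are hypothetical):
#
#     np.save("gail_test_rewards.npy", np.array(test_rewards))
#     ppo.pi_base.save_weights("gail_pendulum_pi_base.h5")
#     plt.show()   # keep the final reward plot on screen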