Commit 7fb38af8 authored by sjmonagi

images only

parent 6cf21ccc
import random
import numpy as np
import tensorflow as tf
random.seed(123)
np.random.seed(123)
class DRQN(object):
def __init__(self, action_n,
cell,
scope,
fcl_dims,
save_path,
input_size,
nodes_num,
gamma=0.98):
self.cell = cell
self.scope = scope
self.gamma = gamma
self.fc1_dims = fcl_dims
self.action_n = action_n
self.save_path = save_path
self.nodes_num = nodes_num
with tf.variable_scope(scope):
# separate agent observation features and positions
self.inputs = tf.placeholder(tf.float32, shape=(None, 515), name="features_positions")
# additional goals
self.goals = tf.placeholder(tf.float32, shape=(None, 3), name="Goals_")
# previous_action
self.pre_action = tf.placeholder(tf.int32, shape=(None,), name="pre_action")
# actions
self.actions = tf.placeholder(tf.int32, shape=(None,), name="actions")
# Q-targets-values
self.Q_values = tf.placeholder(tf.float32, shape=(None,), name="Targets_Q_Values")
self.pre_action_ = tf.one_hot(self.pre_action, self.action_n, dtype=tf.float32,
name="pre_action_OneHot_enc")
lstm_input = tf.concat((self.inputs, self.goals), axis=1)
lstm_input_ = tf.concat((lstm_input, self.pre_action_), axis=1)
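# note: the LSTM input concatenates the 515-dim features+positions, the 3-dim goal, and the
# one-hot previous action, so input_size passed to the constructor should equal 518 + action_n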
with tf.variable_scope("RNN"):
self.train_length = tf.placeholder(tf.int32)
self.batch_size = tf.placeholder(tf.int32, shape=[])
self.input_flat = tf.reshape(tf.layers.flatten(lstm_input_),
[self.batch_size, self.train_length, input_size])
# number_of_units may need to be changed
# self.cell = tf.contrib.rnn.BasicLSTMCell(num_units=self.nodes_num,
# state_is_tuple=True)
self.state_in = self.cell.zero_state(self.batch_size, tf.float32)
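# zero_state provides the initial (c, h) state at the start of a rollout; afterwards the state
# returned by predict() is fed back in through self.state_in so the recurrence carries across steps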
self.rnn, self.rnn_state = tf.nn.dynamic_rnn(inputs=self.input_flat,
cell=self.cell,
dtype=tf.float32,
initial_state=self.state_in,
scope=scope + '_rnn')
self.rnn_flat = tf.reshape(self.rnn, shape=[-1, self.nodes_num])
dense1 = tf.layers.dense(self.rnn_flat, self.fc1_dims, activation=tf.nn.relu, trainable=True)
# final output layer
self.predict_op = tf.layers.dense(dense1, action_n, trainable=True)
actions_q_values = tf.reduce_sum(self.predict_op * tf.one_hot(self.actions, self.action_n),
axis=1)
# self.clipped_Q_values = tf.clip_by_value(self.Q_values, -1 / (1 - self.gamma), 0)
self.cost = tf.reduce_mean(tf.square(self.Q_values - actions_q_values))
self.train_op = tf.train.AdamOptimizer(1e-3).minimize(self.cost)
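# self.Q_values holds one-step TD targets computed outside the graph (see optimize()):
# y = r + gamma * max_a' Q_target(s', a') for non-terminal transitions, y = r at terminals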
tf.summary.scalar("Cost", self.cost)
tf.summary.histogram("Goals", self.goals)
tf.summary.histogram("Action_Q_values", self.Q_values)
tf.summary.histogram("LSTM", self.rnn)
tf.summary.histogram("LSTM_State", self.rnn_state)
self.merged = tf.summary.merge_all()
def hard_update_from(self, other):
mine = [t for t in tf.trainable_variables() if t.name.startswith(self.scope)]
mine = sorted(mine, key=lambda v: v.name)
theirs = [t for t in tf.trainable_variables() if t.name.startswith(other.scope)]
theirs = sorted(theirs, key=lambda v: v.name)
self.session.run([v_t.assign(v) for v_t, v in zip(mine, theirs)])
def soft_update_from(self, other, tau=0.95):
mine = [t for t in tf.trainable_variables() if t.name.startswith(self.scope)]
mine = sorted(mine, key=lambda v: v.name)
theirs = [t for t in tf.trainable_variables() if t.name.startswith(other.scope)]
theirs = sorted(theirs, key=lambda v: v.name)
self.session.run([v_t.assign(v_t * (1. - tau) + v * tau) for v_t, v in zip(mine, theirs)])
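# note: with the default tau=0.95 each call moves the target weights 95% of the way toward the
# online weights; a smaller tau would make the target network track the online network more slowly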
def set_session(self, session):
self.session = session
def predict(self, pos_obs_state, goals, batch_size, trace_length, rnn_state, pre_action):
actions_q_values, rnn, rnn_state_ = self.session.run([self.predict_op, self.rnn, self.rnn_state],
feed_dict={self.goals: goals,
self.state_in: rnn_state,
self.inputs: pos_obs_state,
self.batch_size: batch_size,
self.pre_action: pre_action,
self.train_length: trace_length})
return actions_q_values, rnn, rnn_state_
def update(self, goals, states, actions, batch_size, q_values, trace_length, rnn_state, pre_action):
self.c, _ = self.session.run([self.cost, self.train_op],
feed_dict={self.goals: goals,
self.inputs: states,
self.actions: actions,
self.Q_values: q_values,
self.state_in: rnn_state,
self.batch_size: batch_size,
self.pre_action: pre_action,
self.train_length: trace_length})
return self.c
def sample_action(self, goal, batch_size, trace_length, epsilon, rnn_state, pos_obs_state, pre_action):
"""Implements epsilon greedy algorithm"""
if np.random.random() < epsilon:
q_values, rnn, rnn_state_ = self.predict(pos_obs_state=[pos_obs_state],
goals=[goal],
pre_action=[pre_action],
batch_size=batch_size,
trace_length=trace_length,
rnn_state=rnn_state)
action = np.random.randint(1, self.action_n)
else:
action_q_values, _, rnn_state_ = self.predict(pos_obs_state=[pos_obs_state],
goals=[goal],
pre_action=[pre_action],
batch_size=batch_size,
trace_length=trace_length,
rnn_state=rnn_state)
action = np.argmax(action_q_values[0])
return action, rnn_state_
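# usage (sketch): the recurrent state must be threaded through the episode by the caller, e.g.
#   rnn_state = (np.zeros([1, nodes_num]), np.zeros([1, nodes_num]))
#   action, rnn_state = model.sample_action(goal=goal, batch_size=1, trace_length=1,
#                                           epsilon=epsilon, rnn_state=rnn_state,
#                                           pos_obs_state=obs_pos_state, pre_action=pre_action_idx)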
def load(self):
self.saver = tf.train.Saver(tf.global_variables())
load_was_success = True
try:
save_dir = '/'.join(self.save_path.split('/')[:-1])
ckpt = tf.train.get_checkpoint_state(save_dir)
load_path = ckpt.model_checkpoint_path
self.saver.restore(self.session, load_path)
except Exception:
print("no saved model to load. starting new session")
load_was_success = False
else:
print("loaded model: {}".format(load_path))
saver = tf.train.Saver(tf.global_variables())
episode_number = int(load_path.split('-')[-1])
def save(self, n):
self.saver.save(self.session, self.save_path, global_step=n)
print("SAVED MODEL #{}".format(n))
def optimize(self, model, target_model, batch_size, trace_length, her_buffer, optimization_steps):
losses = 0
for _ in range(optimization_steps):
rnn_stat_train = (np.zeros([batch_size, self.nodes_num]), np.zeros([batch_size, self.nodes_num]))
train_batch = her_buffer.sample(batch_size=batch_size, trace_length=trace_length)
pre_action, states, curr_actions, rewards, next_states, dones, goals = map(np.array, zip(*train_batch))
# Calculate targets
next_Qs, _, _ = target_model.predict(goals=goals,
pre_action=pre_action,
pos_obs_state=next_states,
rnn_state=rnn_stat_train,
trace_length=trace_length,
batch_size=batch_size)
next_Q = np.amax(next_Qs, axis=1)
target_q_values = rewards + np.invert(dones).astype(np.float32) * self.gamma * next_Q
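# np.invert flips the boolean dones, so the bootstrap term gamma * max Q is zeroed for terminal transitions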
# Calculate network loss
loss = model.update(goals=goals,
states=states,
actions=curr_actions,
pre_action=pre_action,
rnn_state=rnn_stat_train,
q_values=target_q_values,
trace_length=trace_length,
batch_size=batch_size)
losses += loss
return losses / optimization_steps
def log(self, encoder_summary, drqn_summary, step):
encoder_writer = tf.summary.FileWriter("/home/nagi/Desktop/Master_Project/DRQN_features_pos_2/encoder")
encoder_writer.add_summary(encoder_summary, global_step=step)
writer = tf.summary.FileWriter("/home/nagi/Desktop/Master_Project/DRQN_features_pos_2/Train")
writer.add_summary(drqn_summary, global_step=step)
import random
import numpy as np
import pandas as pd
import tensorflow as tf
#matplotlib.use("TkAgg")
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
from DRQN_HER import DRQN
from Environment import Environment
from Environment_top_view import Environment_topview
from Her_episodes_experiences import her_buffer
from autoencoder import load_autoencoder
from experience_buffer import experience_buffer
from helper import train_valid_env_sync
random.seed(123)
np.random.seed(123)
dir = "/previous_Action_modified_her_with_sequence/DRQN.ckpt"
##### environment_Variables
grid_size = 0.18 # size of the agent step
top_view = True # displaying top-view
distance_threshold = grid_size * 3 # distance threshold to the goal
action_n = 6  # number of allowed actions
random_init_position = False # Random initial positions only -- no change in the agent orientation
random_init_pose = True # Random initial positions with random agent orientation
reward = "shaped" # reward type "shaped","sparse"
######################### hyper-parameters
num_episodes = 50100
her_strategy = "future"
her_samples = 4
batch_size = 32
trace_length = 16
gamma = 0.99
fcl_dims = 512
nodes_num = 256
optimistion_steps = 40
epsilon_max = 1
epsilon_min = 0.05
epsilon_decay = epsilon_max - ((epsilon_max - epsilon_min) / 20000)
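# with epsilon_max=1 and epsilon_min=0.05 this gives a per-episode decay factor of
# 1 - 0.95/20000 ~= 0.99995, intended to be applied multiplicatively as
# epsilon = max(epsilon * epsilon_decay, epsilon_min) (see the commented epsilon update in the training loop)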
plotted_data = pd.DataFrame(
columns=["Episodes", "Successful trajectories", "Failed trajectories", "Ratio", "loss", "epsilon"])
legend_elements = [Line2D([0], [0], marker="o", color="white", label="Navigable Positions",
markerfacecolor="grey", markersize=10),
Line2D([0], [0], marker="X", color="white", label="Goal Positions",
markerfacecolor="grey", markersize=10),
Line2D([0], [0], marker="o", color="white", label="Initial Agent Position",
markerfacecolor="blue", markersize=10),
Line2D([0], [0], marker="v", color="white", label="Looking Right",
markerfacecolor="red", markersize=10),
Line2D([0], [0], marker="^", color="white", label="Looking Left",
markerfacecolor="grey", markersize=10),
Line2D([0], [0], marker="<", color="white", label="looking Back",
markerfacecolor="red", markersize=10),
]
# experience replay parameters
her_rec_buffer = her_buffer()
episode_buffer = experience_buffer(distance=distance_threshold, reward_typ=reward, her_samples=her_samples, her_strategy=her_strategy)
env = Environment(random_init_position=random_init_position, random_init_pos_orient=random_init_pose, reward_typ=reward,
distance=distance_threshold, random_goals=True, grid_size=grid_size, agent_mode="bot")
if top_view:
envT = Environment_topview(grid_size=grid_size, agent_mode="bot", distance=distance_threshold, reward_typ=reward)
positions = env.get_reachable_position()
plt.ion()
# Autoencoder
print("Autoencoder")
ae_sess, ae = load_autoencoder()
global_step = tf.Variable(0, name="global_step", trainable=False)
loss = 0
# main loop
print("DQN_HER_Model")
drqn_graph = tf.Graph()
# the DRQN constructor also expects an LSTM cell and the LSTM input size
# (515 features+positions + 3 goal coordinates + action_n one-hot previous action)
cell = tf.nn.rnn_cell.LSTMCell(num_units=nodes_num, state_is_tuple=True)
model = DRQN(action_n=action_n, cell=cell, nodes_num=nodes_num, fcl_dims=fcl_dims, scope="model",
save_path=dir, input_size=518 + action_n)
# target_model = DRQN(action_n=action_n, nodes_num=nodes_num, fcl_dims=fcl_dims, scope="target_model",
# save_path=dir)
with tf.Session() as sess:
model.set_session(sess)
# target_model.set_session(sess)
sess.run(tf.global_variables_initializer())
model.load()
start = global_step.eval(sess)
successes = 0
failures = 0
epsilon = 1
for n in range(start, num_episodes):
# rnn_init_state
rnn_state = (np.zeros([1, nodes_num]), np.zeros([1, nodes_num]))
# reset environment
obs_state, pos_state, goal, distance, pose, pre_action_idx = env.reset()
if top_view:
# additional env top view for validation
agent_pos_top, pose_top = envT.reset(x_pos=pose[0],
y_pos=pose[1],
z_pos=pose[2],
angle=pose[4])
if top_view:
# validate that the agent's position matches across the two environment objects
train_valid_env_sync(pose, pose_top)
features = ae_sess.run(ae.feature_vector, feed_dict={ae.image: obs_state[None, :, :, :]})
features = np.squeeze(features, axis=0)
obs_pos_state = np.concatenate((features, pos_state), axis=0)
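# the autoencoder feature vector (presumably 512-dim) concatenated with the 3-dim position
# gives the 515-dim input expected by the DRQN "features_positions" placeholder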
plt.close()
plt.figure()
plt.ion()
for pos in positions:
plt.scatter(pos[0], pos[2], s=20, c="grey", marker="o", alpha=1)
x_start, x_end = plt.xlim()
y_start, y_end = plt.ylim()
plt.xticks(np.arange(x_start, x_end, grid_size), rotation=90)
plt.yticks(np.arange((y_start - grid_size), y_end, grid_size))
plt.xlabel("X-Coordinates")
plt.ylabel("Y-Coordinates")
plt.legend(handles=legend_elements, loc="upper left", bbox_to_anchor=(1, 1), prop={"size": 6})
plt.grid()
plt.tight_layout()
plt.scatter(pos_state[0], pos_state[2], c="blue", marker="o")
plt.scatter(goal[0], goal[2], c="green", marker="X")
plt.pause(0.9)
if pose[4]==0:
plt.scatter(pos_state[0], pos_state[2], c="white", marker="s", alpha=1)
plt.scatter(pos_state[0], pos_state[2], c="red", marker="v", alpha=1)
elif pose[4]==90:
plt.scatter(pos_state[0], pos_state[2], c="white", marker="s", alpha=1)
plt.scatter(pos_state[0], pos_state[2], c="red", marker=">", alpha=1)
elif pose[4]==180:
plt.scatter(pos_state[0], pos_state[2], c="white", marker="s", alpha=1)
plt.scatter(pos_state[0], pos_state[2], c="red", marker="^", alpha=1)
elif pose[4]==270:
plt.scatter(pos_state[0], pos_state[2], c="white", marker="s", alpha=1)
plt.scatter(pos_state[0], pos_state[2], c="red", marker="<", alpha=1)
plt.pause(0.01)
done = False
while not done:
curr_action_idx, rnn_state_ = model.sample_action(goal=goal,
batch_size=1,
trace_length=1,
epsilon=0,
rnn_state=rnn_state,
pos_obs_state=obs_pos_state,
pre_action=pre_action_idx)
obs_state_, pos_state_, distance_, done, reward, collision, pose_ = env.step(curr_action_idx,
goal, distance)
if top_view:
# top-view environment used to verify the main environment
obsStateT, posStateT, distanceT, doneT, rewardT, collisionT, agentPoseT = envT.step(curr_action_idx,
goal, distance)
if top_view:
# validate that the agent's position matches across the two environment objects
train_valid_env_sync(pose_, agentPoseT)
features_ = ae_sess.run(ae.feature_vector, feed_dict={ae.image: obs_state_[None, :, :, :]})
features_ = np.squeeze(features_, axis=0)
obs_pos_state_ = np.concatenate((features_, pos_state_), axis=0)
if pose[4] == 0:
plt.scatter(pos_state[0], pos_state[2], c="white", marker="s", alpha=1)
plt.scatter(pos_state[0], pos_state[2], c="red", marker="v", alpha=1)
elif pose[4] == 90:
plt.scatter(pos_state[0], pos_state[2], c="white", marker="s", alpha=1)
plt.scatter(pos_state[0], pos_state[2], c="red", marker=">", alpha=1)
elif pose[4] == 180:
plt.scatter(pos_state[0], pos_state[2], c="white", marker="s", alpha=1)
plt.scatter(pos_state[0], pos_state[2], c="red", marker="^", alpha=1)
elif pose[4] == 270:
plt.scatter(pos_state[0], pos_state[2], c="white", marker="s", alpha=1)
plt.scatter(pos_state[0], pos_state[2], c="red", marker="<", alpha=1)
plt.pause(0.01)
# # append to episode buffer
# episode_buffer.add(np.reshape(
# np.array([pre_action_idx, obs_pos_state, curr_action_idx, reward, obs_pos_state_, done, goal]),
# [1, 7]))
rnn_state = rnn_state_
obs_pos_state = obs_pos_state_
distance = distance_
pre_action_idx = curr_action_idx
# keep the plotted position/orientation in sync with the stepped environment
pos_state = pos_state_
pose = pose_
if done:
if distance <= distance_threshold:
successes += done
else:
failures += done
break
# her_buffer = episode_buffer.her()
# her_rec_buffer.add(her_buffer)
# episode_buffer.clear()
#
# if n > 50 and n != 0:
# loss = model.optimize(model=model,
# batch_size=batch_size,
# trace_length=trace_length,
# target_model=target_model,
# her_buffer=her_rec_buffer,
# optimization_steps=optimistion_steps)
# if n % 100 == 0 and n > 0:
# print("--update model--")
# target_model.soft_update_from(model)
#
# # model.log(drqn_summary=drqn_summary, encoder_summary=ae_summary, step=start)
#
# epsilon = max(epsilon * epsilon_decay, epsilon_min)
plotted_data = plotted_data.append({"Episodes": str(n),
"Successful trajectories": successes / (n + 1),
"Failed trajectories": failures / (n + 1),
"Ratio": (successes / (failures + 1e-6)),
"loss": loss, "epsilon": epsilon}, ignore_index=True)
#plotting_training_log(n, plotted_data, successes, failures, loss, goal, distance, pos_state, epsilon)
# global_step.assign(n).eval()
# # saving
# if n % 50 == 0 and n > 0:
# model.save(n)
import random
import numpy as np
import pandas as pd
import tensorflow as tf
from DRQN_HER import DRQN
from Environment import Environment
from Environment_top_view import Environment_topview
from Her_episodes_experiences import her_buffer
from autoencoder import load_autoencoder
from experience_buffer import experience_buffer
from helper import plotting_training_log, train_valid_env_sync, validate
random.seed(123)
np.random.seed(123)
dir = "/home/nagi/Desktop/Master_project_final/DRQN_3_shaped_reward_only_sequence/DRQN.ckpt"
##### environment_Variables
grid_size = 0.18 # size of the agent step
top_view = True # displaying top-view
distance_threshold = grid_size * 2 # distance threshold to the goal
action_n = 3  # number of allowed actions
random_init_position = False # Random initial positions only -- no change in the agent orientation
random_init_pose = True # Random initial positions with random agent orientation
reward = "shaped" # reward type "shaped","sparse"
######################### hyper-parameters
num_episodes = 15001
her_samples = 8
batch_size = 32
trace_length = 8
gamma = 0.99
fcl_dims = 512
nodes_num = 256
optimistion_steps = 40
epsilon_max = 1
epsilon_min = 0.001
input_size = 521 ## size of the input to the LSTM
epsilon_decay = epsilon_max - ((epsilon_max / 3500))
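# per-episode decay factor of 1 - 1/3500 ~= 0.99971; note that epsilon_min is not part of this formula here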
## pandas data-frame for plotting
plotted_data = pd.DataFrame(
columns=["Episodes", "Successful trajectories", "Failed trajectories", "Ratio", "loss", "epsilon"])
# experience replay parameters
her_rec_buffer = her_buffer()
episode_buffer = experience_buffer(distance=distance_threshold, reward_typ=reward, her_samples=her_samples)
env = Environment(random_init_position=random_init_position, random_init_pos_orient=random_init_pose, reward_typ=reward,
distance=distance_threshold, random_goals=True, grid_size=grid_size, agent_mode="bot")
if top_view:
envT = Environment_topview(grid_size=grid_size, agent_mode="bot", distance=distance_threshold, reward_typ=reward)
# Autoencoder
print("Autoencoder")
ae_sess, ae = load_autoencoder()
global_step = tf.Variable(0, name="global_step", trainable=False)
loss = 0
# main loop
print("DQN_HER_Model")
drqn_graph = tf.Graph()
cell = tf.nn.rnn_cell.LSTMCell(num_units=nodes_num, state_is_tuple=True)
cellT = tf.nn.rnn_cell.LSTMCell(num_units=nodes_num, state_is_tuple=True)
model = DRQN(action_n=action_n, cell=cell, fcl_dims=fcl_dims, scope="model",
save_path=dir, nodes_num=nodes_num, input_size=input_size)
target_model = DRQN(action_n=action_n, cell=cellT, fcl_dims=fcl_dims, scope="target_model",
save_path=dir, nodes_num=nodes_num, input_size=input_size)
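# the online and target networks get their own LSTM cells and variable scopes ("model" / "target_model"),
# which is what hard_update_from / soft_update_from rely on to match variables by name prefix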
print("##### Env with grid_size equals", grid_size, "and", reward, "reward ######")
with tf.Session() as sess:
model.set_session(sess)
target_model.set_session(sess)
sess.run(tf.global_variables_initializer())
model.load()
start = global_step.eval(sess)
successes = 0
failures = 0
epsilon = 1
for n in range(start, num_episodes):
step_num = 0
# rnn_init_state
rnn_state = (np.zeros([1, nodes_num]), np.zeros([1, nodes_num]))
# reset environment
obs_state, pos_state, goal, distance, pose, pre_action_idx = env.reset()
if top_view:
# additional env top view for validation
agent_pos_top, pose_top = envT.reset(x_pos=pose[0],
y_pos=pose[1],
z_pos=pose[2],
angle=pose[4])
if top_view:
# validate that the agent's position matches across the two environment objects
train_valid_env_sync(pose, pose_top)
features = ae_sess.run(ae.feature_vector, feed_dict={ae.image: obs_state[None, :, :, :]})
features = np.squeeze(features, axis=0)
obs_pos_state = np.concatenate((features, pos_state), axis=0)
done = False
while not done:
curr_action_idx, rnn_state_ = model.sample_action(goal=goal,
batch_size=1,
trace_length=1,
epsilon=epsilon,
rnn_state=rnn_state,
pos_obs_state=obs_pos_state