Commit aae9c66d authored by sjmonagi

playing

parent 37f1c80d
import random
import numpy as np
import tensorflow as tf
random.seed(123)
np.random.seed(123)
class DRQN(object):
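    """Deep Recurrent Q-Network (DRQN) with goal conditioning.

    The observation/position features, the goal coordinates and a one-hot encoding of the
    previous action are concatenated, passed through an LSTM cell, and mapped by fully
    connected layers to one Q-value per action. The network is trained with a mean-squared
    TD error against externally supplied target Q-values.
    """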
def __init__(self, action_n,
cell,
scope,
fcl_dims,
save_path,
input_size,
nodes_num,
gamma=0.98):
self.cell = cell
self.scope = scope
self.gamma = gamma
self.fc1_dims = fcl_dims
self.action_n = action_n
self.save_path = save_path
self.nodes_num = nodes_num
with tf.variable_scope(scope):
            # agent observation features concatenated with the agent position
self.inputs = tf.placeholder(tf.float32, shape=(None, 515), name="features_positions")
# additional goals
self.goals = tf.placeholder(tf.float32, shape=(None, 3), name="Goals_")
# previous_action
self.pre_action = tf.placeholder(tf.int32, shape=(None,), name="pre_action")
# actions
self.actions = tf.placeholder(tf.int32, shape=(None,), name="actions")
# Q-targets-values
self.Q_values = tf.placeholder(tf.float32, shape=(None,), name="Targets_Q_Values")
self.pre_action_ = tf.one_hot(self.pre_action, self.action_n, dtype=tf.float32,
name="pre_action_OneHot_enc")
lstm_input = tf.concat((self.inputs, self.goals), axis=1)
lstm_input_ = tf.concat((lstm_input, self.pre_action_), axis=1)
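            # per-step LSTM input: 515 observation/position features + 3 goal coordinates
            # + action_n one-hot previous-action entries (must equal the input_size argument)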
with tf.variable_scope("RNN"):
self.train_length = tf.placeholder(tf.int32)
self.batch_size = tf.placeholder(tf.int32, shape=[])
self.input_flat = tf.reshape(tf.layers.flatten(lstm_input_),
[self.batch_size, self.train_length, input_size])
# number_of_units may need to be changed
# self.cell = tf.contrib.rnn.BasicLSTMCell(num_units=self.nodes_num,
# state_is_tuple=True)
self.state_in = self.cell.zero_state(self.batch_size, tf.float32)
self.rnn, self.rnn_state = tf.nn.dynamic_rnn(inputs=self.input_flat,
cell=self.cell,
dtype=tf.float32,
initial_state=self.state_in,
scope=scope + '_rnn')
self.rnn_flat = tf.reshape(self.rnn, shape=[-1, self.nodes_num])
dense1 = tf.layers.dense(self.rnn_flat, self.fc1_dims, activation=tf.nn.relu, trainable=True)
# final output layer
self.predict_op = tf.layers.dense(dense1, action_n, trainable=True)
            actions_q_values = tf.reduce_sum(self.predict_op * tf.one_hot(self.actions, self.action_n),
                                             axis=1)
# self.clipped_Q_values = tf.clip_by_value(self.Q_values, -1 / (1 - self.gamma), 0)
self.cost = tf.reduce_mean(tf.square(self.Q_values - actions_q_values))
self.train_op = tf.train.AdamOptimizer(1e-3).minimize(self.cost)
tf.summary.scalar("Cost", self.cost)
tf.summary.histogram("Goals", self.goals)
tf.summary.histogram("Action_Q_values", self.Q_values)
tf.summary.histogram("LSTM", self.rnn)
tf.summary.histogram("LSTM_State", self.rnn_state)
self.merged = tf.summary.merge_all()
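    # --- target-network synchronisation ---
    # hard_update_from copies the other network's trainable variables verbatim;
    # soft_update_from blends them with factor tau (Polyak-style averaging).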
def hard_update_from(self, other):
mine = [t for t in tf.trainable_variables() if t.name.startswith(self.scope)]
mine = sorted(mine, key=lambda v: v.name)
theirs = [t for t in tf.trainable_variables() if t.name.startswith(other.scope)]
theirs = sorted(theirs, key=lambda v: v.name)
self.session.run([v_t.assign(v) for v_t, v in zip(mine, theirs)])
def soft_update_from(self, other, tau=0.95):
mine = [t for t in tf.trainable_variables() if t.name.startswith(self.scope)]
mine = sorted(mine, key=lambda v: v.name)
theirs = [t for t in tf.trainable_variables() if t.name.startswith(other.scope)]
theirs = sorted(theirs, key=lambda v: v.name)
self.session.run([v_t.assign(v_t * (1. - tau) + v * tau) for v_t, v in zip(mine, theirs)])
def set_session(self, session):
self.session = session
def predict(self, pos_obs_state, goals, batch_size, trace_length, rnn_state, pre_action):
actions_q_values, rnn, rnn_state_ = self.session.run([self.predict_op, self.rnn, self.rnn_state],
feed_dict={self.goals: goals,
self.state_in: rnn_state,
self.inputs: pos_obs_state,
self.batch_size: batch_size,
self.pre_action: pre_action,
self.train_length: trace_length})
return actions_q_values, rnn, rnn_state_
def update(self, goals, states, actions, batch_size, q_values, trace_length, rnn_state, pre_action):
self.c, _ = self.session.run([self.cost, self.train_op],
feed_dict={self.goals: goals,
self.inputs: states,
self.actions: actions,
self.Q_values: q_values,
self.state_in: rnn_state,
self.batch_size: batch_size,
self.pre_action: pre_action,
self.train_length: trace_length})
return self.c
def sample_action(self, goal, batch_size, trace_length, epsilon, rnn_state, pos_obs_state, pre_action):
"""Implements epsilon greedy algorithm"""
if np.random.random() < epsilon:
q_values, rnn, rnn_state_ = self.predict(pos_obs_state=[pos_obs_state],
goals=[goal],
pre_action=[pre_action],
batch_size=batch_size,
trace_length=trace_length,
rnn_state=rnn_state)
            action = np.random.randint(0, self.action_n)  # uniform random action over all action indices
else:
action_q_values, _, rnn_state_ = self.predict(pos_obs_state=[pos_obs_state],
goals=[goal],
pre_action=[pre_action],
batch_size=batch_size,
trace_length=trace_length,
rnn_state=rnn_state)
action = np.argmax(action_q_values[0])
return action, rnn_state_
def load(self):
self.saver = tf.train.Saver(tf.global_variables())
load_was_success = True
try:
save_dir = '/'.join(self.save_path.split('/')[:-1])
ckpt = tf.train.get_checkpoint_state(save_dir)
load_path = ckpt.model_checkpoint_path
self.saver.restore(self.session, load_path)
        except Exception:
            print("no saved model to load. starting new session")
            load_was_success = False
        else:
            print("loaded model: {}".format(load_path))
        return load_was_success
def save(self, n):
self.saver.save(self.session, self.save_path, global_step=n)
print("SAVED MODEL #{}".format(n))
def optimize(self, model, target_model, batch_size, trace_length, her_buffer, optimization_steps):
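        """Sample recurrent traces from the HER buffer and run `optimization_steps` updates.

        Targets follow the standard DQN rule, r + gamma * max_a Q_target(s', a), with the
        bootstrap term masked out on terminal transitions. Returns the mean loss.
        """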
losses = 0
for _ in range(optimization_steps):
            rnn_state_train = (np.zeros([batch_size, self.nodes_num]), np.zeros([batch_size, self.nodes_num]))
train_batch = her_buffer.sample(batch_size=batch_size, trace_length=trace_length)
pre_action, states, curr_actions, rewards, next_states, dones, goals = map(np.array, zip(*train_batch))
# Calculate targets
next_Qs, _, _ = target_model.predict(goals=goals,
pre_action=pre_action,
pos_obs_state=next_states,
                                                 rnn_state=rnn_state_train,
trace_length=trace_length,
batch_size=batch_size)
next_Q = np.amax(next_Qs, axis=1)
target_q_values = rewards + np.invert(dones).astype(np.float32) * self.gamma * next_Q
# Calculate network loss
loss = model.update(goals=goals,
states=states,
actions=curr_actions,
pre_action=pre_action,
                                rnn_state=rnn_state_train,
q_values=target_q_values,
trace_length=trace_length,
batch_size=batch_size)
losses += loss
return losses / optimization_steps
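# Training script: goal-conditioned DRQN agent with hindsight experience replay (HER)
# in an AI2-THOR scene. Observations are autoencoder features of the first-person view
# concatenated with the agent position.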
import random
import numpy as np
import pandas as pd
import tensorflow as tf
from Agent_shaped_reward_wiz_her_images_and_postions_6actions.DRQN_HER import DRQN
from Agent_shaped_reward_wiz_her_images_and_postions_6actions.Environment import Environment
from Agent_shaped_reward_wiz_her_images_and_postions_6actions.Environment_top_view import Environment_topview
from Agent_shaped_reward_wiz_her_images_and_postions_6actions.Her_episodes_experiences import her_buffer
from Agent_shaped_reward_wiz_her_images_and_postions_6actions.autoencoder import load_autoencoder
from Agent_shaped_reward_wiz_her_images_and_postions_6actions.experience_buffer import experience_buffer
from Agent_shaped_reward_wiz_her_images_and_postions_6actions.helper import plotting_training_log, train_valid_env_sync, validate
random.seed(123)
np.random.seed(123)
fields_name = ["iteration", "successes"]
dir = "/home/nagi/Desktop/Master_project_final/DRQN_3_her_sparse_image_and_pos_F1_6_Actions/DRQN.ckpt"
##### environment_Variables
grid_size = 0.18 # size of the agent step
top_view = True # displaying top-view
distance_threshold = grid_size * 2 # distance threshold to the goal
action_n = 6  # number of allowed actions
random_init_position = False # Random initial positions only -- no change in the agent orientation
random_init_pose = True # Random initial positions with random agent orientation
reward = "shaped" # reward type "shaped","sparse"
######################### hyper-parameter
num_episodes = 15001
her_samples = 8
batch_size = 32
trace_length = 8
gamma = 0.99
fcl_dims = 512
nodes_num = 256
optimisation_steps = 40
epsilon_max = 1
epsilon_min = 0
input_size = 524  # LSTM input size: 515 observation/position features + 3 goal coordinates + 6 one-hot previous-action entries
epsilon_decay = epsilon_max - (epsilon_max / 3000)
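# multiplicative decay factor: epsilon_decay = 1 - 1/3000 ~= 0.99967; epsilon is multiplied by it
# once per episode in the training loop, annealing from epsilon_max towards epsilon_min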
## pandas data-frame for plotting
plotted_data = pd.DataFrame(
columns=["Episodes", "Successful trajectories", "Failed trajectories", "Ratio", "loss", "epsilon", "F1"])
# experience replay parameters
her_rec_buffer = her_buffer()
episode_buffer = experience_buffer(distance=distance_threshold, reward_typ=reward, her_samples=her_samples)
env = Environment(random_init_position=random_init_position, random_init_pos_orient=random_init_pose, reward_typ=reward,
distance=distance_threshold, random_goals=True, grid_size=grid_size, agent_mode="bot")
if top_view:
envT = Environment_topview(grid_size=grid_size, agent_mode="bot", distance=distance_threshold, reward_typ=reward)
# Autoencoder
print("Autoencoder")
ae_sess, ae = load_autoencoder()
global_step = tf.Variable(0, name="global_step", trainable=False)
loss = 0
# main loop
print("DQN_HER_Model")
drqn_graph = tf.Graph()
cell = tf.nn.rnn_cell.LSTMCell(num_units=nodes_num, state_is_tuple=True)
cellT = tf.nn.rnn_cell.LSTMCell(num_units=nodes_num, state_is_tuple=True)
model = DRQN(action_n=action_n, cell=cell, fcl_dims=fcl_dims, scope="model",
             save_path=model_dir, nodes_num=nodes_num, input_size=input_size)
target_model = DRQN(action_n=action_n, cell=cellT, fcl_dims=fcl_dims, scope="target_model",
                    save_path=model_dir, nodes_num=nodes_num, input_size=input_size)
print("##### Env with grid_size equals", grid_size, "and", reward, "reward ######")
with tf.Session() as sess:
model.set_session(sess)
target_model.set_session(sess)
sess.run(tf.global_variables_initializer())
model.load()
start = global_step.eval(sess)
successes = 0
failures = 0
epsilon = 1
for n in range(start, num_episodes):
step_num = 0
# rnn_init_state
rnn_state = (np.zeros([1, nodes_num]), np.zeros([1, nodes_num]))
# reset environment
obs_state, pos_state, goal, distance, pose, pre_action_idx = env.reset()
if top_view:
# additional env top view for validation
agent_pos_top, pose_top = envT.reset(x_pos=pose[0],
y_pos=pose[1],
z_pos=pose[2],
angle=pose[4])
if top_view:
            # validate that the agent position matches between the two environment objects
train_valid_env_sync(pose, pose_top)
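        # encode the first-person frame with the pretrained autoencoder and concatenate the
        # resulting feature vector with the agent position to form the DRQN observation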
features = ae_sess.run(ae.feature_vector, feed_dict={ae.image: obs_state[None, :, :, :]})
features = np.squeeze(features, axis=0)
obs_pos_state = np.concatenate((features, pos_state), axis=0)
done = False
while not done:
curr_action_idx, rnn_state_ = model.sample_action(goal=goal,
batch_size=1,
trace_length=1,
epsilon=epsilon,
rnn_state=rnn_state,
pos_obs_state=obs_pos_state,
pre_action=pre_action_idx)
obs_state_, pos_state_, distance_, done, reward, collision, pose_ = env.step(curr_action_idx,
goal, distance)
if top_view:
# top view environment used for verification of the main environment
obsStateT, posStateT, distanceT, doneT, rewardT, collisionT, agentPoseT = envT.step(curr_action_idx,
goal, distance)
if top_view:
                # validate that the agent position matches between the two environment objects
train_valid_env_sync(pose_, agentPoseT)
features_ = ae_sess.run(ae.feature_vector, feed_dict={ae.image: obs_state_[None, :, :, :]})
features_ = np.squeeze(features_, axis=0)
obs_pos_state_ = np.concatenate((features_, pos_state_), axis=0)
# append to episode buffer
episode_buffer.add(np.reshape(
np.array([pre_action_idx, obs_pos_state, curr_action_idx, reward, obs_pos_state_, done, goal]),
[1, 7]))
rnn_state = rnn_state_
obs_pos_state = obs_pos_state_
distance = distance_
pre_action_idx = curr_action_idx
step_num += 1
if done:
if distance < distance_threshold:
successes += done
else:
failures += done
break
if step_num == 200:
done = True
failures += done
break
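        # episode finished: build hindsight-relabelled transitions from the episode buffer
        # (see Her_episodes_experiences) and push them into the recurrent replay buffer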
        her_episode = episode_buffer.her()
        her_rec_buffer.add(her_episode)
episode_buffer.clear()
plotted_data = plotted_data.append({"Episodes": str(n),
"Successful trajectories": successes / (n + 1),
"Failed trajectories": failures / (n + 1),
"Ratio": (successes / (failures + 1)),
"loss": loss, "epsilon": epsilon,
"F1": ((1-(failures / (n + 1))) * (successes / ( n + 1))) /
(((1-(failures / (n + 1))) + ((successes / ( n + 1))))+1)}, ignore_index=True)
plotting_training_log(n, plotted_data, successes, failures, loss, goal, distance, pos_state, epsilon, step_num)
###validation###
if n % 2000 == 0 and n > 0:
validate(n=n, nodes_num=nodes_num, top_view=top_view, env=env, envT=envT, ae=ae, ae_sess=ae_sess,
distance_threshold=distance_threshold, model=model)
        if n > 100:
            loss = model.optimize(model=model,
                                  batch_size=batch_size,
                                  trace_length=trace_length,
                                  target_model=target_model,
                                  her_buffer=her_rec_buffer,
                                  optimization_steps=optimisation_steps)
if n % 4000 == 0 and n > 0:
print("#### update model ####")
target_model.soft_update_from(model)
epsilon = max(epsilon * epsilon_decay, epsilon_min)
global_step.assign(n).eval()
# saving
if n % 50 == 0 and n > 0:
model.save(n)
import random
import re
import ai2thor.controller
import numpy as np
import pandas as pd
random.seed(123)
np.random.seed(123)
class Environment(object):
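    """Goal-conditioned navigation environment wrapping an AI2-THOR controller.

    Exposes a discrete action space (rotate right/left, move ahead/back/right/left),
    random agent/goal placement on the reachable grid, and shaped or sparse goal-based
    rewards.
    """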
def __init__(self,
distance,
reward_typ,
action_n=6,
grid_size=0.15,
visibility_distance=1.5,
player_screen_width=300,
player_screen_height=300,
full_scrn=False,
depth_image=False,
random_init_position=False,
random_init_pos_orient=False,
random_goals=False,
scene="FloorPlan225",
agent_mode="tall"):
self.scene = scene
self.action_n = action_n
self.distance = distance
self.grid_size = grid_size
self.full_scrn = full_scrn
self.reward_typ = reward_typ
self.agent_mode = agent_mode
self.depth_image = depth_image
self.random_goal = random_goals
self.visibility_distance = visibility_distance
self.player_screen_width = player_screen_width
self.player_screen_height = player_screen_height
self.random_init_position = random_init_position
self.random_init_pos_orient = random_init_pos_orient
self.orientations = [0.0, 90.0, 180.0, 270.0, 360.0]
self.ctrl = ai2thor.controller.Controller(scene=self.scene,
gridSize=self.grid_size,
renderDepthImage=self.depth_image,
visibilityDistance=self.visibility_distance,
agentMode=self.agent_mode)
def reset(self):
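        """Reset the scene, optionally teleport the agent to a random pose and sample a random goal.

        Returns the first-person frame, agent position, goal position, agent-goal distance,
        agent pose and the initial previous-action index (0).
        """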
new_random_goal = 0
self.ctrl.reset(self.scene)
agent_init_position, random_goal = self.random_positions()
if self.random_init_pos_orient:
# Random init Agent positions and orientation
self.ctrl.step(action="TeleportFull",
x=agent_init_position[0],
y=agent_init_position[1],
z=agent_init_position[2],
rotation=random.choice(self.orientations),
horizon=0.0)
elif self.random_init_position:
# Random init Agent positions only
self.ctrl.step(action="Teleport",
x=agent_init_position[0],
y=agent_init_position[1],
z=agent_init_position[2])
else:
pass
if self.random_goal:
new_random_goal = random_goal
agent_position, agent_rotation, agent_pose = self.agent_properties()
        if not np.array_equal(np.array(list(self.ctrl.last_event.metadata["agent"]["position"].values())),
                              agent_position):
            print("agent init position does not equal the agent position attribute")
pre_action_idx = 0
        # if the agent's initial position equals the goal position, adjust so that they differ
new_goal = self.agent_goal_pos_not_equal(agent_position, new_random_goal)
agent_pos_dis = np.linalg.norm(new_goal - agent_position)
first_person_obs = self.ctrl.last_event.frame
return first_person_obs, agent_position, new_goal, agent_pos_dis, agent_pose, pre_action_idx
    def step(self, action, goal, distance):
        # map each discrete action index to the corresponding AI2-THOR controller action
        action_names = ['RotateRight', 'RotateLeft', 'MoveAhead', 'MoveBack', 'MoveRight', 'MoveLeft']
        first_person_obs, agent_position, distance_, done, reward, collision, agent_pose = 0, 0, 0, 0, 0, 0, 0
        if 0 <= action < len(action_names):
            self.ctrl.step(action=action_names[action])
            reward, done, distance_, first_person_obs, collision, agent_position, agent_pose = \
                self.post_action_state(goal, distance)
        return first_person_obs, agent_position, distance_, done, reward, collision, agent_pose
def agent_properties(self):
agent_position = np.array(list(self.ctrl.last_event.metadata["agent"]["position"].values()))
agent_rotation = np.array(list(self.ctrl.last_event.metadata["agent"]["rotation"].values()))
agent_pose = np.concatenate((agent_position, agent_rotation), axis=0)
return agent_position, agent_rotation, agent_pose
def get_reachable_position(self):
self.ctrl.step(action='GetReachablePositions')
return pd.DataFrame(self.ctrl.last_event.metadata["reachablePositions"]).values
def random_positions(self):
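        """Sample two distinct reachable positions (agent start and goal) that are more than
        1.5 * self.distance apart, re-sampling until the separation constraint is met."""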
while True:
positions = self.get_reachable_position()
random_positions = random.sample(list(positions), 2)
agent_pos = random_positions[0]
goal_pos = random_positions[1]
distance = np.linalg.norm(goal_pos - agent_pos)
if distance > 1.5*self.distance:
break
else:
print("Agent to Goal distance less than", 1.5*self.distance)
return agent_pos, goal_pos
def post_action_state(self, goal, dist):
if self.reward_typ == "shaped":
reward, done, dist_, first_person_obs, collide, agent_position, agent_pose = self.shaped_reward(goal=goal,
dist=dist)
else:
reward, done, dist_, first_person_obs, collide, agent_position, agent_pose = self.sparse_reward(goal=goal)
return reward, done, dist_, first_person_obs, collide, agent_position, agent_pose
def agent_goal_pos_not_equal(self, agent_pos, goal_pos):
new_random_goal_position = goal_pos