sjmonagi / Simulated visual based reinforcement learning for navigation with Hindsight experience Replay

Commit 25d1a552, authored Apr 09, 2020 by sjmonagi
Commit message: images only
Parent commit: 7fb38af8
Showing 9 changed files with 1644 additions and 0 deletions.
Agent_shaped_reward_wiz_her_images_and_postions/DRQN_HER.py                   +195  -0
Agent_shaped_reward_wiz_her_images_and_postions/DRQN_HER_Playing.py           +255  -0
Agent_shaped_reward_wiz_her_images_and_postions/DRQN_HER_Training.py          +194  -0
Agent_shaped_reward_wiz_her_images_and_postions/Environment.py                +209  -0
Agent_shaped_reward_wiz_her_images_and_postions/Environment_top_view.py       +157  -0
Agent_shaped_reward_wiz_her_images_and_postions/Her_episodes_experiences.py   +27   -0
Agent_shaped_reward_wiz_her_images_and_postions/autoencoder.py                +331  -0
Agent_shaped_reward_wiz_her_images_and_postions/experience_buffer.py          +99   -0
Agent_shaped_reward_wiz_her_images_and_postions/helper.py                     +177  -0
Agent_shaped_reward_wiz_her_images_and_postions/DRQN_HER.py (new file, mode 100755)
import random

import numpy as np
import tensorflow as tf

random.seed(123)
np.random.seed(123)


class DRQN(object):
    def __init__(self, action_n, cell, scope, fcl_dims, save_path, input_size, nodes_num, gamma=0.98):
        self.cell = cell
        self.scope = scope
        self.gamma = gamma
        self.fc1_dims = fcl_dims
        self.action_n = action_n
        self.save_path = save_path
        self.nodes_num = nodes_num

        with tf.variable_scope(scope):
            # separate agent observations and positions
            self.inputs = tf.placeholder(tf.float32, shape=(None, 515), name="features_positions")
            # additional goals
            self.goals = tf.placeholder(tf.float32, shape=(None, 3), name="Goals_")
            # previous action
            self.pre_action = tf.placeholder(tf.int32, shape=(None,), name="pre_action")
            # actions
            self.actions = tf.placeholder(tf.int32, shape=(None,), name="actions")
            # Q-target values
            self.Q_values = tf.placeholder(tf.float32, shape=(None,), name="Targets_Q_Values")

            self.pre_action_ = tf.one_hot(self.pre_action, self.action_n, dtype=tf.float32,
                                          name="pre_action_OneHot_enc")

            lstm_input = tf.concat((self.inputs, self.goals), axis=1)
            lstm_input_ = tf.concat((lstm_input, self.pre_action_), axis=1)

            with tf.variable_scope("RNN"):
                self.train_length = tf.placeholder(tf.int32)
                self.batch_size = tf.placeholder(tf.int32, shape=[])

                self.input_flat = tf.reshape(tf.layers.flatten(lstm_input_),
                                             [self.batch_size, self.train_length, input_size])

                # number_of_units may need to be changed
                # self.cell = tf.contrib.rnn.BasicLSTMCell(num_units=self.nodes_num,
                #                                          state_is_tuple=True)

                self.state_in = self.cell.zero_state(self.batch_size, tf.float32)

                self.rnn, self.rnn_state = tf.nn.dynamic_rnn(inputs=self.input_flat,
                                                             cell=self.cell,
                                                             dtype=tf.float32,
                                                             initial_state=self.state_in,
                                                             scope=scope + '_rnn')

                self.rnn_flat = tf.reshape(self.rnn, shape=[-1, self.nodes_num])

            dense1 = tf.layers.dense(self.rnn_flat, self.fc1_dims, activation=tf.nn.relu, trainable=True)

            # final output layer
            self.predict_op = tf.layers.dense(dense1, action_n, trainable=True)

            actions_q_values = tf.reduce_sum(self.predict_op * tf.one_hot(self.actions, self.action_n),
                                             reduction_indices=[1])

            # self.clipped_Q_values = tf.clip_by_value(self.Q_values, -1 / (1 - self.gamma), 0)

            self.cost = tf.reduce_mean(tf.square(self.Q_values - actions_q_values))

            self.train_op = tf.train.AdamOptimizer(1e-3).minimize(self.cost)

            tf.summary.scalar("Cost", self.cost)
            tf.summary.histogram("Goals", self.goals)
            tf.summary.histogram("Action_Q_values", self.Q_values)
            tf.summary.histogram("LSTM", self.rnn)
            tf.summary.histogram("LSTM_State", self.rnn_state)
            self.merged = tf.summary.merge_all()

    def hard_update_from(self, other):
        mine = [t for t in tf.trainable_variables() if t.name.startswith(self.scope)]
        mine = sorted(mine, key=lambda v: v.name)
        theirs = [t for t in tf.trainable_variables() if t.name.startswith(other.scope)]
        theirs = sorted(theirs, key=lambda v: v.name)
        self.session.run([v_t.assign(v) for v_t, v in zip(mine, theirs)])

    def soft_update_from(self, other, tau=0.95):
        mine = [t for t in tf.trainable_variables() if t.name.startswith(self.scope)]
        mine = sorted(mine, key=lambda v: v.name)
        theirs = [t for t in tf.trainable_variables() if t.name.startswith(other.scope)]
        theirs = sorted(theirs, key=lambda v: v.name)
        self.session.run([v_t.assign(v_t * (1. - tau) + v * tau) for v_t, v in zip(mine, theirs)])

    def set_session(self, session):
        self.session = session

    def predict(self, pos_obs_state, goals, batch_size, trace_length, rnn_state, pre_action):
        actions_q_values, rnn, rnn_state_ = self.session.run(
            [self.predict_op, self.rnn, self.rnn_state],
            feed_dict={self.goals: goals,
                       self.state_in: rnn_state,
                       self.inputs: pos_obs_state,
                       self.batch_size: batch_size,
                       self.pre_action: pre_action,
                       self.train_length: trace_length})
        return actions_q_values, rnn, rnn_state_

    def update(self, goals, states, actions, batch_size, q_values, trace_length, rnn_state, pre_action):
        self.c, _ = self.session.run(
            [self.cost, self.train_op],
            feed_dict={self.goals: goals,
                       self.inputs: states,
                       self.actions: actions,
                       self.Q_values: q_values,
                       self.state_in: rnn_state,
                       self.batch_size: batch_size,
                       self.pre_action: pre_action,
                       self.train_length: trace_length})
        return self.c

    def sample_action(self, goal, batch_size, trace_length, epsilon, rnn_state, pos_obs_state, pre_action):
        """Implements the epsilon-greedy algorithm."""
        if np.random.random() < epsilon:
            q_values, rnn, rnn_state_ = self.predict(pos_obs_state=[pos_obs_state],
                                                     goals=[goal],
                                                     pre_action=[pre_action],
                                                     batch_size=batch_size,
                                                     trace_length=trace_length,
                                                     rnn_state=rnn_state)
            action = np.random.randint(1, self.action_n)
        else:
            action_q_values, _, rnn_state_ = self.predict(pos_obs_state=[pos_obs_state],
                                                          goals=[goal],
                                                          pre_action=[pre_action],
                                                          batch_size=batch_size,
                                                          trace_length=trace_length,
                                                          rnn_state=rnn_state)
            action = np.argmax(action_q_values[0])
        return action, rnn_state_

    def load(self):
        self.saver = tf.train.Saver(tf.global_variables())
        load_was_success = True
        try:
            save_dir = '/'.join(self.save_path.split('/')[:-1])
            ckpt = tf.train.get_checkpoint_state(save_dir)
            load_path = ckpt.model_checkpoint_path
            self.saver.restore(self.session, load_path)
        except:
            print("no saved model to load. starting new session")
            load_was_success = False
        else:
            print("loaded model: {}".format(load_path))
            saver = tf.train.Saver(tf.global_variables())
            episode_number = int(load_path.split('-')[-1])

    def save(self, n):
        self.saver.save(self.session, self.save_path, global_step=n)
        print("SAVED MODEL #{}".format(n))

    def optimize(self, model, target_model, batch_size, trace_length, her_buffer, optimization_steps):
        losses = 0
        for _ in range(optimization_steps):
            rnn_stat_train = (np.zeros([batch_size, self.nodes_num]),
                              np.zeros([batch_size, self.nodes_num]))

            train_batch = her_buffer.sample(batch_size=batch_size, trace_length=trace_length)

            pre_action, states, curr_actions, rewards, next_states, dones, goals = map(np.array, zip(*train_batch))

            # Calculate targets
            next_Qs, _, _ = target_model.predict(goals=goals,
                                                 pre_action=pre_action,
                                                 pos_obs_state=next_states,
                                                 rnn_state=rnn_stat_train,
                                                 trace_length=trace_length,
                                                 batch_size=batch_size)

            next_Q = np.amax(next_Qs, axis=1)
            target_q_values = rewards + np.invert(dones).astype(np.float32) * self.gamma * next_Q

            # Calculate network loss
            loss = model.update(goals=goals,
                                states=states,
                                actions=curr_actions,
                                pre_action=pre_action,
                                rnn_state=rnn_stat_train,
                                q_values=target_q_values,
                                trace_length=trace_length,
                                batch_size=batch_size)
            losses += loss

        return losses / optimization_steps
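For reference, the Bellman target that optimize() computes in NumPy can be checked in isolation. The sketch below reuses the same expression with a hypothetical batch of four transitions and three actions; all numbers are chosen purely for illustration and are not taken from the repository.

import numpy as np

# Hypothetical batch: rewards, terminal flags, and target-network Q-values
# for the next states (shapes and values are illustrative only).
rewards = np.array([0.0, -1.0, 1.0, 0.0], dtype=np.float32)
dones = np.array([False, False, True, False])
next_Qs = np.array([[0.2, 0.5, 0.1],
                    [0.0, 0.3, 0.4],
                    [0.9, 0.1, 0.2],
                    [0.6, 0.2, 0.3]], dtype=np.float32)
gamma = 0.98  # default discount in DRQN.__init__

# Same expression as in DRQN.optimize(): bootstrap from the target network's
# best next-state value, masked to zero on terminal transitions.
next_Q = np.amax(next_Qs, axis=1)
target_q_values = rewards + np.invert(dones).astype(np.float32) * gamma * next_Q
print(target_q_values)  # [0.49, -0.608, 1.0, 0.588]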
Agent_shaped_reward_wiz_her_images_and_postions/DRQN_HER_Playing.py (new file, mode 100755)
import random

import numpy as np
import pandas as pd
import tensorflow as tf
# matplotlib.use("TkAgg")
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D

from DRQN_HER import DRQN
from Environment import Environment
from Environment_top_view import Environment_topview
from Her_episodes_experiences import her_buffer
from autoencoder import load_autoencoder
from experience_buffer import experience_buffer
from helper import train_valid_env_sync

random.seed(123)
np.random.seed(123)

dir = "/previous_Action_modified_her_with_sequence/DRQN.ckpt"

##### environment_Variables
grid_size = 0.18  # size of the agent step
top_view = True  # displaying top-view
distance_threshold = grid_size * 3  # distance threshold to the goal
action_n = 6  # number of allowed actions
random_init_position = False  # random initial positions only -- no change in the agent orientation
random_init_pose = True  # random initial positions with random agent orientation
reward = "shaped"  # reward type: "shaped", "sparse"

######################### hyper-parameters
num_episodes = 50100
her_strategy = "future"
her_samples = 4
batch_size = 32
trace_length = 16
gamma = 0.99
fcl_dims = 512
nodes_num = 256
optimistion_steps = 40
epsilon_max = 1
epsilon_min = 0.05
epsilon_decay = epsilon_max - ((epsilon_max - epsilon_min) / 20000)

plotted_data = pd.DataFrame(columns=["Episodes", "Successful trajectories", "Failed trajectories",
                                     "Ratio", "loss", "epsilon"])

legend_elements = [Line2D([0], [0], marker="o", color="white", label="Navigable Positions",
                          markerfacecolor="grey", markersize=10),
                   Line2D([0], [0], marker="X", color="white", label="Goal Positions",
                          markerfacecolor="grey", markersize=10),
                   Line2D([0], [0], marker="o", color="white", label="Initial Agent Position",
                          markerfacecolor="blue", markersize=10),
                   Line2D([0], [0], marker="v", color="white", label="Looking Right",
                          markerfacecolor="red", markersize=10),
                   Line2D([0], [0], marker="^", color="white", label="Looking Left",
                          markerfacecolor="grey", markersize=10),
                   Line2D([0], [0], marker="<", color="white", label="Looking Back",
                          markerfacecolor="red", markersize=10),
                   ]

# experience replay parameters
her_rec_buffer = her_buffer()
episode_buffer = experience_buffer(distance=distance_threshold,
                                   reward_typ=reward,
                                   her_samples=her_samples,
                                   her_strategy=her_strategy)

env = Environment(random_init_position=random_init_position,
                  random_init_pos_orient=random_init_pose,
                  reward_typ=reward,
                  distance=distance_threshold,
                  random_goals=True,
                  grid_size=grid_size,
                  agent_mode="bot")

if top_view:
    envT = Environment_topview(grid_size=grid_size,
                               agent_mode="bot",
                               distance=distance_threshold,
                               reward_typ=reward)

positions = env.get_reachable_position()

plt.ion()

# Autoencoder
print("Autoencoder")
ae_sess, ae = load_autoencoder()

global_step = tf.Variable(0, name="global_step", trainable=False)

loss = 0

# main loop
print("DQN_HER_Model")
drqn_graph = tf.Graph()

model = DRQN(action_n=action_n, nodes_num=nodes_num, fcl_dims=fcl_dims, scope="model",
             save_path=dir)
# target_model = DRQN(action_n=action_n, nodes_num=nodes_num, fcl_dims=fcl_dims, scope="target_model",
#                     save_path=dir)

with tf.Session() as sess:
    model.set_session(sess)
    # target_model.set_session(sess)

    sess.run(tf.global_variables_initializer())

    model.load()

    start = global_step.eval(sess)

    successes = 0
    failures = 0
    epsilon = 1

    for n in range(start, num_episodes):
        # rnn_init_state
        rnn_state = (np.zeros([1, nodes_num]), np.zeros([1, nodes_num]))

        # reset environment
        obs_state, pos_state, goal, distance, pose, pre_action_idx = env.reset()

        if top_view:
            # additional env top view for validation
            agent_pos_top, pose_top = envT.reset(x_pos=pose[0], y_pos=pose[1], z_pos=pose[2], angle=pose[4])
        if top_view:
            # validate the agent position across the two environment objects
            train_valid_env_sync(pose, pose_top)

        features = ae_sess.run(ae.feature_vector, feed_dict={ae.image: obs_state[None, :, :, :]})
        features = np.squeeze(features, axis=0)

        obs_pos_state = np.concatenate((features, pos_state), axis=0)

        plt.close()
        plt.figure()
        plt.ion()
        for pos in positions:
            plt.scatter(pos[0], pos[2], s=20, c="grey", marker="o", alpha=1)
        x_start, x_end = plt.xlim()
        y_start, y_end = plt.ylim()
        plt.xticks(np.arange(x_start, x_end, grid_size), rotation=90)
        plt.yticks(np.arange((y_start - grid_size), y_end, grid_size))
        plt.xlabel("X-Coordinates")
        plt.ylabel("Y-Coordinates")
        plt.legend(handles=legend_elements, loc="upper left", bbox_to_anchor=(1, 1), prop={"size": 6})
        plt.grid()
        plt.tight_layout()
        plt.scatter(pos_state[0], pos_state[2], c="blue", marker="o")
        plt.scatter(goal[0], goal[2], c="green", marker="X")
        plt.pause(0.9)

        if pose[4] == 0:
            plt.scatter(pos_state[0], pos_state[2], c="white", marker="s", alpha=1)
            plt.scatter(pos_state[0], pos_state[2], c="red", marker="v", alpha=1)
        elif pose[4] == 90:
            plt.scatter(pos_state[0], pos_state[2], c="white", marker="s", alpha=1)
            plt.scatter(pos_state[0], pos_state[2], c="red", marker=">", alpha=1)
        elif pose[4] == 180:
            plt.scatter(pos_state[0], pos_state[2], c="white", marker="s", alpha=1)
            plt.scatter(pos_state[0], pos_state[2], c="red", marker="^", alpha=1)
        elif pose[4] == 270:
            plt.scatter(pos_state[0], pos_state[2], c="white", marker="s", alpha=1)
            plt.scatter(pos_state[0], pos_state[2], c="red", marker="<", alpha=1)
        plt.pause(0.01)

        done = False
        while not done:
            curr_action_idx, rnn_state_ = model.sample_action(goal=goal,
                                                              batch_size=1,
                                                              trace_length=1,
                                                              epsilon=0,
                                                              rnn_state=rnn_state,
                                                              pos_obs_state=obs_pos_state,
                                                              pre_action=pre_action_idx)

            obs_state_, pos_state_, distance_, done, reward, collision, pose_ = env.step(curr_action_idx, goal, distance)

            if top_view:
                # top-view environment used for verification of the main environment
                obsStateT, posStateT, distanceT, doneT, rewardT, collisionT, agentPoseT = envT.step(curr_action_idx,
                                                                                                    goal, distance)
            if top_view:
                # validate the agent position across the two environment objects
                train_valid_env_sync(pose_, agentPoseT)

            features_ = ae_sess.run(ae.feature_vector, feed_dict={ae.image: obs_state_[None, :, :, :]})
            features_ = np.squeeze(features_, axis=0)

            obs_pos_state_ = np.concatenate((features_, pos_state_), axis=0)

            if pose[4] == 0:
                plt.scatter(pos_state[0], pos_state[2], c="white", marker="s", alpha=1)
                plt.scatter(pos_state[0], pos_state[2], c="red", marker="v", alpha=1)
            elif pose[4] == 90:
                plt.scatter(pos_state[0], pos_state[2], c="white", marker="s", alpha=1)
                plt.scatter(pos_state[0], pos_state[2], c="red", marker=">", alpha=1)
            elif pose[4] == 180:
                plt.scatter(pos_state[0], pos_state[2], c="white", marker="s", alpha=1)
                plt.scatter(pos_state[0], pos_state[2], c="red", marker="^", alpha=1)
            elif pose[4] == 270:
                plt.scatter(pos_state[0], pos_state[2], c="white", marker="s", alpha=1)
                plt.scatter(pos_state[0], pos_state[2], c="red", marker="<", alpha=1)
            plt.pause(0.01)

            # # append to episode buffer
            # episode_buffer.add(np.reshape(
            #     np.array([pre_action_idx, obs_pos_state, curr_action_idx, reward, obs_pos_state_, done, goal]),
            #     [1, 7]))

            rnn_state = rnn_state_
            obs_pos_state = obs_pos_state_
            distance = distance_
            pre_action_idx = curr_action_idx

            if done:
                if distance <= distance_threshold:
                    successes += done
                else:
                    failures += done
                break

        # her_buffer = episode_buffer.her()
        # her_rec_buffer.add(her_buffer)
        # episode_buffer.clear()
        #
        # if n > 50 and n != 0:
        #     loss = model.optimize(model=model,
        #                           batch_size=batch_size,
        #                           trace_length=trace_length,
        #                           target_model=target_model,
        #                           her_buffer=her_rec_buffer,
        #                           optimization_steps=optimistion_steps)
        #     if n % 100 == 0 and n > 0:
        #         print("--update model--")
        #         target_model.soft_update_from(model)
        #
        #     # model.log(drqn_summary=drqn_summary, encoder_summary=ae_summary, step=start)
        #
        #     epsilon = max(epsilon * epsilon_decay, epsilon_min)

        plotted_data = plotted_data.append({"Episodes": str(n),
                                            "Successful trajectories": successes / (n + 1),
                                            "Failed trajectories": failures / (n + 1),
                                            "Ratio": (successes / (failures + 1e-6)),
                                            "loss": loss,
                                            "epsilon": epsilon},
                                           ignore_index=True)

        # plotting_training_log(n, plotted_data, successes, failures, loss, goal, distance, pos_state, epsilon)

        # global_step.assign(n).eval()
        # # saving
        # if n % 50 == 0 and n > 0:
        #     model.save(n)
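Note that, as committed, this playing script constructs DRQN without the cell and input_size arguments that DRQN.__init__ requires, so the constructor call would raise a TypeError. Below is a hedged sketch of a call consistent with the class definition above, mirroring how DRQN_HER_Training.py builds its models; the LSTM cell and the input size of 524 (515 feature/position values + 3 goal coordinates + a one-hot of the 6 actions) are assumptions inferred from the placeholder shapes, not part of this commit.

import tensorflow as tf
from DRQN_HER import DRQN

# Assumed values taken from the hyper-parameters of the script above.
action_n = 6
nodes_num = 256
fcl_dims = 512
dir = "/previous_Action_modified_her_with_sequence/DRQN.ckpt"
input_size = 515 + 3 + action_n  # features+positions, goal, one-hot previous action

cell = tf.nn.rnn_cell.LSTMCell(num_units=nodes_num, state_is_tuple=True)
model = DRQN(action_n=action_n, cell=cell, fcl_dims=fcl_dims, scope="model",
             save_path=dir, nodes_num=nodes_num, input_size=input_size)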
Agent_shaped_reward_wiz_her_images_and_postions/DRQN_HER_Training.py (new file, mode 100755)
import random

import numpy as np
import pandas as pd
import tensorflow as tf

from DRQN_HER import DRQN
from Environment import Environment
from Environment_top_view import Environment_topview
from Her_episodes_experiences import her_buffer
from autoencoder import load_autoencoder
from experience_buffer import experience_buffer
from helper import plotting_training_log, train_valid_env_sync, validate

random.seed(123)
np.random.seed(123)

fields_name = ["iteration", "successes"]

dir = "/home/nagi/Desktop/Master_project_final/DRQN_3_her_sparse_image_and_pos_F1/DRQN.ckpt"

##### environment_Variables
grid_size = 0.18  # size of the agent step
top_view = True  # displaying top-view
distance_threshold = grid_size * 2  # distance threshold to the goal
action_n = 3  # number of allowed actions
random_init_position = False  # random initial positions only -- no change in the agent orientation
random_init_pose = True  # random initial positions with random agent orientation
reward = "sparse"  # reward type: "shaped", "sparse"

######################### hyper-parameters
num_episodes = 15001
her_samples = 8
batch_size = 32
trace_length = 8
gamma = 0.99
fcl_dims = 512
nodes_num = 256
optimistion_steps = 40
epsilon_max = 1
epsilon_min = 0
input_size = 521  ## size of the input to the LSTM
epsilon_decay = epsilon_max - (epsilon_max / 3000)

## pandas data-frame for plotting
plotted_data = pd.DataFrame(columns=["Episodes", "Successful trajectories", "Failed trajectories",
                                     "Ratio", "loss", "epsilon", "F1"])

# experience replay parameters
her_rec_buffer = her_buffer()
episode_buffer = experience_buffer(distance=distance_threshold,
                                   reward_typ=reward,
                                   her_samples=her_samples)

env = Environment(random_init_position=random_init_position,
                  random_init_pos_orient=random_init_pose,
                  reward_typ=reward,
                  distance=distance_threshold,
                  random_goals=True,
                  grid_size=grid_size,
                  agent_mode="bot")

if top_view:
    envT = Environment_topview(grid_size=grid_size,
                               agent_mode="bot",
                               distance=distance_threshold,
                               reward_typ=reward)

# Autoencoder
print("Autoencoder")
ae_sess, ae = load_autoencoder()

global_step = tf.Variable(0, name="global_step", trainable=False)

loss = 0

# main loop
print("DQN_HER_Model")
drqn_graph = tf.Graph()

cell = tf.nn.rnn_cell.LSTMCell(num_units=nodes_num, state_is_tuple=True)
cellT = tf.nn.rnn_cell.LSTMCell(num_units=nodes_num, state_is_tuple=True)

model = DRQN(action_n=action_n, cell=cell, fcl_dims=fcl_dims, scope="model",
             save_path=dir, nodes_num=nodes_num, input_size=input_size)

target_model = DRQN(action_n=action_n, cell=cellT, fcl_dims=fcl_dims, scope="target_model",
                    save_path=dir, nodes_num=nodes_num, input_size=input_size)

print("##### Env with grid_size equals", grid_size, "and", reward, "reward ######")

with tf.Session() as sess:
    model.set_session(sess)
    target_model.set_session(sess)

    sess.run(tf.global_variables_initializer())

    model.load()

    start = global_step.eval(sess)

    successes = 0
    failures = 0
    epsilon = 1

    for n in range(start, num_episodes):
        step_num = 0

        # rnn_init_state
        rnn_state = (np.zeros([1, nodes_num]), np.zeros([1, nodes_num]))

        # reset environment
        obs_state, pos_state, goal, distance, pose, pre_action_idx = env.reset()

        if top_view:
            # additional env top view for validation
            agent_pos_top, pose_top = envT.reset(x_pos=pose[0], y_pos=pose[1], z_pos=pose[2], angle=pose[4])
        if top_view:
            # validate the agent position across the two environment objects
            train_valid_env_sync(pose, pose_top)
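The target_model defined above is kept in sync with the online model through DRQN.soft_update_from (used, for instance, in the commented-out update step of DRQN_HER_Playing.py). The following is a minimal NumPy sketch of that mixing rule with the default tau=0.95; the weight values are illustrative assumptions, not taken from the repository.

import numpy as np

tau = 0.95  # default in DRQN.soft_update_from

# Illustrative weights: one target-network tensor and its online counterpart.
target_w = np.array([0.0, 1.0, -2.0])
online_w = np.array([1.0, 0.0, 2.0])

# Same rule as soft_update_from: the target keeps (1 - tau) of itself
# and takes tau of the online network's weights.
target_w = target_w * (1. - tau) + online_w * tau
print(target_w)  # [0.95, 0.05, 1.8]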