@@ -60,7 +60,7 @@ def train(self):
             agent.start()
 
         while True:
-            time.sleep(60 * 5)
+            time.sleep(60 * 10)
             self.save_model("./save_model/breakout_a3c")
 
     # approximate policy and value using Neural Network
@@ -71,7 +71,6 @@ def build_model(self):
         input = Input(shape=self.state_size)
         conv = Conv2D(16, (8, 8), strides=(4, 4), activation='relu')(input)
         conv = Conv2D(32, (4, 4), strides=(2, 2), activation='relu')(conv)
-        conv = Conv2D(32, (3, 3), strides=(1, 1), activation='relu')(conv)
         conv = Flatten()(conv)
         fc = Dense(256, activation='relu')(conv)
         policy = Dense(self.action_size, activation='softmax')(fc)
@@ -80,8 +79,8 @@ def build_model(self):
         actor = Model(inputs=input, outputs=policy)
         critic = Model(inputs=input, outputs=value)
 
-        actor.predict(np.random.rand(1, 84, 84, 4))
-        critic.predict(np.random.rand(1, 84, 84, 4))
+        actor._make_predict_function()
+        critic._make_predict_function()
 
         actor.summary()
         critic.summary()
@@ -163,6 +162,8 @@ def __init__(self, action_size, state_size, model, sess, optimizer, discount_fac
 
         self.states, self.actions, self.rewards = [],[],[]
 
+        self.local_actor, self.local_critic = self.build_localmodel()
+
         self.avg_p_max = 0
         self.avg_loss = 0
 
@@ -209,6 +210,11 @@ def run(self):
                 elif action == 1: real_action = 2
                 else: real_action = 3
 
+                if dead:
+                    action = 0
+                    real_action = 1
+                    dead = False
+
                 next_observe, reward, done, info = env.step(real_action)
                 # pre-process the observation --> history
                 next_state = pre_processing(next_observe, observe)
@@ -232,13 +238,13 @@ def run(self):
                 if dead:
                     history = np.stack((next_state, next_state, next_state, next_state), axis=2)
                     history = np.reshape([history], (1, 84, 84, 4))
-                    dead = False
                 else:
                     history = next_history
 
                 #
                 if self.t >= self.t_max or done:
-                    self.train_t(done)
+                    self.train_model(done)
+                    self.update_localmodel()
                     self.t = 0
 
                 # if done, plot the score over episodes
@@ -271,7 +277,7 @@ def discount_rewards(self, rewards, done):
         return discounted_rewards
 
     # update policy network and value network every episode
-    def train_t(self, done):
+    def train_model(self, done):
         discounted_rewards = self.discount_rewards(self.rewards, done)
 
         states = np.zeros((len(self.states), 84, 84, 4))
@@ -289,9 +295,36 @@ def train_t(self, done):
         self.optimizer[1]([states, discounted_rewards])
         self.states, self.actions, self.rewards = [], [], []
 
+    def build_localmodel(self):
+        input = Input(shape=self.state_size)
+        conv = Conv2D(16, (8, 8), strides=(4, 4), activation='relu')(input)
+        conv = Conv2D(32, (4, 4), strides=(2, 2), activation='relu')(conv)
+        conv = Flatten()(conv)
+        fc = Dense(256, activation='relu')(conv)
+        policy = Dense(self.action_size, activation='softmax')(fc)
+        value = Dense(1, activation='linear')(fc)
+
+        actor = Model(inputs=input, outputs=policy)
+        critic = Model(inputs=input, outputs=value)
+
+        actor._make_predict_function()
+        critic._make_predict_function()
+
+        actor.set_weights(self.actor.get_weights())
+        critic.set_weights(self.critic.get_weights())
+
+        actor.summary()
+        critic.summary()
+
+        return actor, critic
+
+    def update_localmodel(self):
+        self.local_actor.set_weights(self.actor.get_weights())
+        self.local_critic.set_weights(self.critic.get_weights())
+
     def get_action(self, history):
         history = np.float32(history / 255.)
-        policy = self.actor.predict(history)[0]
+        policy = self.local_actor.predict(history)[0]
 
         policy = policy - np.finfo(np.float32).epsneg
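The hunks above give each worker thread its own copy of the actor and critic (`build_localmodel`), refresh that copy from the global networks after every training step (`update_localmodel`), and route `get_action` through the local actor, so action selection is decoupled from the global models that other threads are updating concurrently. The snippet below is only a minimal, self-contained sketch of that weight-synchronization pattern; it assumes Keras 2.x with the functional API, and the names `build_actor`, `global_actor`, `local_actor`, and `sync_local` are illustrative rather than taken from the repository.

```python
# Minimal sketch of the local/global weight-sync pattern used in the diff.
# Assumes Keras 2.x; the names here are illustrative, not from the repository.
import numpy as np
from keras.layers import Dense, Input
from keras.models import Model


def build_actor(state_size, action_size):
    # The local copy must have exactly the same architecture as the global
    # model, otherwise set_weights() fails with a shape mismatch.
    state = Input(shape=(state_size,))
    hidden = Dense(16, activation='relu')(state)
    policy = Dense(action_size, activation='softmax')(hidden)
    return Model(inputs=state, outputs=policy)


global_actor = build_actor(4, 3)   # shared model, updated by the optimizer
local_actor = build_actor(4, 3)    # per-thread copy used only for predict()


def sync_local():
    # Copy the current global weights into the local model, mirroring
    # update_localmodel() in the diff above.
    local_actor.set_weights(global_actor.get_weights())


sync_local()
action_probs = local_actor.predict(np.random.rand(1, 4))[0]
print(action_probs)
```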