|
43 | 43 | #define ND_TD3_POLICY_MAX_PER_ACTION_SIGMA ndBrainFloat(1.0f)
|
44 | 44 | #define ND_MAX_SAC_ENTROPY_COEFFICIENT ndBrainFloat (2.0e-5f)
|
45 | 45 |
|
46 |
| -#define ND_TD3_VARIANCE |
| 46 | +//#define ND_TD3_VARIANCE |
47 | 47 |
|
48 | 48 | ndBrainAgentDeterministicPolicyGradient_Trainer::HyperParameters::HyperParameters()
|
49 | 49 | {
|
@@ -381,6 +381,9 @@ void ndBrainAgentDeterministicPolicyGradient_Trainer::BuildCriticClass()
|
381 | 381 | layers.PushBack(new ndBrainLayerLinear(layers[layers.GetCount() - 1]->GetOutputSize(), m_parameters.m_hiddenLayersNumberOfNeurons));
|
382 | 382 | layers.PushBack(new ndBrainLayerActivationTanh(layers[layers.GetCount() - 1]->GetOutputSize()));
|
383 | 383 | layers.PushBack(new ndBrainLayerLinear(layers[layers.GetCount() - 1]->GetOutputSize(), 1));
|
| 384 | +#ifdef ND_TD3_VARIANCE |
| 385 | + layers.PushBack(new ndBrainLayerActivationRelu(layers[layers.GetCount() - 1]->GetOutputSize())); |
| 386 | +#endif |
384 | 387 |
|
385 | 388 | ndSharedPtr<ndBrain> critic(new ndBrain);
|
386 | 389 | for (ndInt32 i = 0; i < layers.GetCount(); ++i)
|
@@ -740,21 +743,6 @@ void ndBrainAgentDeterministicPolicyGradient_Trainer::LearnPolicyFunction()
|
740 | 743 | const ndBrain& brain = **m_policyTrainer->GetBrain();
|
741 | 744 | ndInt32 criticInputSize = brain.GetInputSize() + brain.GetOutputSize();
|
742 | 745 |
|
743 |
| -#ifdef ND_TD3_VARIANCE |
744 |
| - ndInt32 count = m_parameters.m_criticUpdatesCount * m_parameters.m_miniBatchSize; |
745 |
| - m_miniBatchIndexBuffer.SetCount(0); |
746 |
| - for (ndInt32 i = 0; i < count; ++i) |
747 |
| - { |
748 |
| - m_miniBatchIndexBuffer.PushBack(m_shuffleBuffer[m_shuffleBatchIndex]); |
749 |
| - m_shuffleBatchIndex++; |
750 |
| - if (m_shuffleBatchIndex >= m_shuffleBuffer.GetCount()) |
751 |
| - { |
752 |
| - m_shuffleBatchIndex = 0; |
753 |
| - m_shuffleBuffer.RandomShuffle(m_shuffleBuffer.GetCount()); |
754 |
| - } |
755 |
| - } |
756 |
| -#endif |
757 |
| - |
758 | 746 | m_actionBatch.SetCount(m_parameters.m_miniBatchSize * brain.GetOutputSize());
|
759 | 747 | m_obsevationsBatch.SetCount(m_parameters.m_miniBatchSize * brain.GetInputSize());
|
760 | 748 | m_policyGradientBatch.SetCount(m_parameters.m_miniBatchSize * brain.GetOutputSize());
|
@@ -788,19 +776,10 @@ void ndBrainAgentDeterministicPolicyGradient_Trainer::LearnPolicyFunction()
|
788 | 776 |
|
789 | 777 | for (ndInt32 i = 0; i < ndInt32(sizeof(m_criticTrainer) / sizeof(m_criticTrainer[0])); ++i)
|
790 | 778 | {
|
791 |
| - #ifdef ND_TD3_VARIANCE |
792 |
| - m_referenceCriticTrainer[i]->MakePrediction(m_criticObservationActionBatch); |
793 |
| - m_referenceCriticTrainer[i]->GetOutput(m_criticOutputGradients[i]); |
794 |
| - m_criticTrainer[i]->MakePrediction(m_criticObservationActionBatch); |
795 |
| - m_criticTrainer[i]->GetOutput(m_criticValue[i]); |
796 |
| - |
797 |
| - m_criticOutputGradients[i].Sub(m_criticValue[i]); |
798 |
| - m_criticOutputGradients[i].Scale(ndBrainFloat(-1.0f)); |
799 |
| - #else |
800 |
| - m_criticTrainer[i]->MakePrediction(m_criticObservationActionBatch); |
801 |
| - m_criticTrainer[i]->GetOutput(m_criticValue[i]); |
802 |
| - m_criticOutputGradients[i].Set(ndBrainFloat(1.0f)); |
803 |
| - #endif |
| 779 | + m_criticTrainer[i]->MakePrediction(m_criticObservationActionBatch); |
| 780 | + m_criticTrainer[i]->GetOutput(m_criticValue[i]); |
| 781 | + m_criticOutputGradients[i].Set(ndBrainFloat(1.0f)); |
| 782 | + |
804 | 783 | if (m_parameters.m_entropyRegularizerCoef > ndBrainFloat(1.0e-6f))
|
805 | 784 | {
|
806 | 785 | ndAssert(0);
|
|
0 commit comments