import numpy as np


		
# The best action is (one of) the action(s) holding the largest
# Q-value at position `pos`; ties are broken uniformly at random.

def bestAction(tabQ, pos):
  """Return the index of a greedy (max-Q) action at state `pos`.

  Collects every action whose Q-value equals the row maximum and
  picks one of them uniformly at random (random tie-breaking).

  Parameters:
    tabQ: 2-D array-like of Q-values, indexed as tabQ[state][action].
    pos:  integer state index (row of tabQ).

  Returns:
    int: index of a maximizing action.
  """
  # Use the whole row instead of five hard-coded indices and the
  # global `nbActions`: works for any action count, same result here.
  row = tabQ[pos]
  mx = max(row)
  bestList = [a for a in range(len(row)) if row[a] == mx]
  return bestList[np.random.randint(len(bestList))]

# Q-learning training function

def updateQ(tabQ, alpha, gamma, nbIter, tauxExplor):
   """Run `nbIter` Q-learning episodes, updating `tabQ` in place.

   Each episode starts from a random non-goal state and follows an
   epsilon-greedy policy until the goal state `obj` is reached.

   Parameters:
     alpha:      learning rate.
     gamma:      discount factor applied to the successor's max Q.
     nbIter:     number of episodes to run.
     tauxExplor: exploration rate (probability of a random action).

   Relies on module globals: nbStates, nbActions, obj,
   tabStatesActions, tabRewards, and bestAction.
   """
   for numiter in range(nbIter):
     # randint(nbStates-1) draws 0..nbStates-2; this skips the goal
     # only because `obj` happens to be the last state index.
     start = np.random.randint(nbStates-1)
     pos = start
     while pos != obj:
       # epsilon-greedy action choice
       if np.random.rand() < tauxExplor:
         act = np.random.randint(nbActions)
       else:
         act = bestAction(tabQ, pos)

       new_pos = tabStatesActions[pos][act]
       r = tabRewards[pos][act]

       # Max over the full row instead of five hard-coded entries.
       maxQ = max(tabQ[new_pos])
       # Bellman update. BUG FIX: the original omitted `gamma`,
       # silently discounting with 1.0 whatever value was passed.
       tabQ[pos][act] = tabQ[pos][act] + alpha*(r + gamma*maxQ - tabQ[pos][act])
       pos = new_pos
	   
# Display of the (best) values of the Q-learning table

def afficherTabQ(tabQ):
   """Print the greedy policy: for each state, its best action and successor.

   Relies on module globals nbStates, tabStatesActions and tabStates.
   """
   print("Meilleure action en partant de chaque état --> meilleur nouvel état","\n")
   for state in range(nbStates):
     values = [tabQ[state][a] for a in range(5)]
     greedy = np.argmax(values)
     successor = tabStatesActions[state][greedy]
     print(tabStates[state], greedy," -> ",tabStates[successor])

# Main program
# Definition of the States/Actions/Rewards model

# State names: P6 (index 5) is the goal state (where the food is).
tabStates = ['P1', 'P2','P3','P4','P5','P6']

# tabStatesActions[s][a] = successor state reached by action a in state s.
# Rows start as self-loops; legal moves are then patched in below.
tabStatesActions = np.array([[0,0,0,0,0], [1,1,1,1,1], [2,2,2,2,2], [3,3,3,3,3], [4,4,4,4,4], [5,5,5,5,5]])
tabStatesActions[0][1]=2
tabStatesActions[1][1]=2
tabStatesActions[2][2]=3
tabStatesActions[3][4]=5
tabStatesActions[4][3]=3

# tabRewards[s][a]: -10 penalizes illegal moves (self-loops), -1 is the
# cost of a legal step, +10 rewards the move that reaches the goal.
tabRewards = np.array([[-10,-10,-10,-10,-10], [-10,-10,-10,-10,-10], [-10,-10,-10,-10,-10], [-10,-10,-10,-10,-10], [-10,-10,-10,-10,-10], [-10,-10,-10,-10,-10]])
tabRewards[0][1]=-1
tabRewards[1][1]=-1
tabRewards[2][2]=-1
tabRewards[3][4]=10
tabRewards[4][3]=-1

nbStates = len(tabStates)
nbActions = nbStates-1
tabQ = np.zeros([nbStates,nbActions])
obj = 5  # Corresponds to state P6, i.e. where the food is

# Display of legal moves (for verification)

print("Affichage des mouvements licites (pour simple vérification) " ,"\n")
# Consistency fix: iterate over nbStates/nbActions instead of the
# hard-coded 6 and 5, so this check stays valid if the model changes.
for nums in range(nbStates):
  for numa in range(nbActions):
     if tabStatesActions[nums][numa] != nums:
        nums1 = tabStatesActions[nums][numa]
        print(tabStates[nums],"->",tabStates[nums1])

# Apply Q-learning

updateQ(tabQ, 0.2, 0.9, 100, 0.4)
afficherTabQ(tabQ)