# learningAgents.py
# Base classes for value-estimation and reinforcement-learning agents used by
# the CS188 Pacman reinforcement-learning project.

from game import Directions, Agent, Actions

import random
import util
import time


class ValueEstimationAgent(Agent):
    """
      Abstract agent which assigns Q-values to (state, action)
      pairs for an environment, as well as a value to each
      state and a policy, given respectively by

      V(s) = max_{a in actions} Q(s,a)
      policy(s) = arg_max_{a in actions} Q(s,a)

      Both ValueIterationAgent and QLearningAgent inherit
      from this agent. While a ValueIterationAgent has
      a model of the environment via a MarkovDecisionProcess
      (see mdp.py) that is used to estimate Q-values before
      ever actually acting, the QLearningAgent estimates
      Q-values while acting in the environment.
    """

    def __init__(self, alpha=1.0, epsilon=0.05, gamma=0.8, numTraining=10):
        """
        Sets options, which can be passed in via the Pacman command line using -a alpha=0.5,...
        alpha       - learning rate
        epsilon     - exploration rate
        gamma       - discount factor
        numTraining - number of training episodes, i.e. no learning after these many episodes
        """
        self.alpha = float(alpha)
        self.epsilon = float(epsilon)
        self.discount = float(gamma)
        self.numTraining = int(numTraining)

    # Methods below must be overridden by concrete agents.

    def getQValue(self, state, action):
        """
        Should return Q(state, action)
        """
        util.raiseNotDefined()

    def getValue(self, state):
        """
        What is the value of this state under the best action?
        Concretely, this is given by

        V(s) = max_{a in actions} Q(s,a)
        """
        util.raiseNotDefined()

    def getPolicy(self, state):
        """
        What is the best action to take in the state? Note that because
        we might want to explore, this might not coincide with getAction.
        Concretely, this is given by

        policy(s) = arg_max_{a in actions} Q(s,a)

        If many actions achieve the maximal Q-value,
        it doesn't matter which is selected.
        """
        util.raiseNotDefined()

    def getAction(self, state):
        """
        state: can call state.getLegalActions()
        Choose an action and return it.
        """
        util.raiseNotDefined()
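
# Illustrative sketch (not part of the original file): two small helpers that
# make the formulas in the ValueEstimationAgent docstring concrete.  Here
# 'qValue' stands for any function of (state, action) and 'legalActions' for
# any function returning the actions available in a state; both names are
# hypothetical stand-ins for a concrete agent's getQValue / getLegalActions.

def exampleStateValue(qValue, legalActions, state):
    """V(s) = max_{a in actions} Q(s,a); by convention 0.0 if there are no actions."""
    actions = legalActions(state)
    if not actions:
        return 0.0
    return max(qValue(state, a) for a in actions)


def examplePolicy(qValue, legalActions, state):
    """policy(s) = arg_max_{a in actions} Q(s,a); ties are broken arbitrarily."""
    actions = legalActions(state)
    if not actions:
        return None
    return max(actions, key=lambda a: qValue(state, a))
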
class ReinforcementAgent(ValueEstimationAgent):
    """
      Abstract Reinforcement Agent: a ValueEstimationAgent
      which estimates Q-values (as well as policies) from experience
      rather than from a model.

      What you need to know:
        - The environment will call
          observeTransition(state, action, nextState, deltaReward),
          which in turn calls update(state, action, nextState, deltaReward),
          which you should override.
        - Use self.getLegalActions(state) to know which actions
          are available in a state.
    """

    def update(self, state, action, nextState, reward):
        """
          This class will call this function, which you write, after
          observing a transition and reward.
        """
        util.raiseNotDefined()

    def getLegalActions(self, state):
        """
          Get the actions available for a given
          state. This is what you should use to
          obtain legal actions for a state.
        """
        return self.actionFn(state)

    def observeTransition(self, state, action, nextState, deltaReward):
        """
          Called by the environment to inform the agent that a transition
          has been observed. This will result in a call to self.update
          on the same arguments.

          NOTE: Do *not* override or call this function
        """
        self.episodeRewards += deltaReward
        self.update(state, action, nextState, deltaReward)

    def startEpisode(self):
        """
          Called by the environment when a new episode is starting.
        """
        self.lastState = None
        self.lastAction = None
        self.episodeRewards = 0.0

    def stopEpisode(self):
        """
          Called by the environment when an episode is done.
        """
        if self.episodesSoFar < self.numTraining:
            self.accumTrainRewards += self.episodeRewards
        else:
            self.accumTestRewards += self.episodeRewards
        self.episodesSoFar += 1
        if self.episodesSoFar >= self.numTraining:
            # Training is over: stop exploring and stop learning.
            self.epsilon = 0.0
            self.alpha = 0.0

    def isInTraining(self):
        return self.episodesSoFar < self.numTraining

    def isInTesting(self):
        return not self.isInTraining()

    def __init__(self, actionFn=None, numTraining=100, epsilon=0.5, alpha=0.5, gamma=1):
        """
        actionFn: Function which takes a state and returns the list of legal actions

        alpha       - learning rate
        epsilon     - exploration rate
        gamma       - discount factor
        numTraining - number of training episodes, i.e. no learning after these many episodes
        """
        if actionFn is None:
            actionFn = lambda state: state.getLegalActions()
        self.actionFn = actionFn
        self.episodesSoFar = 0
        self.accumTrainRewards = 0.0
        self.accumTestRewards = 0.0
        self.numTraining = int(numTraining)
        self.epsilon = float(epsilon)
        self.alpha = float(alpha)
        self.discount = float(gamma)

    def setEpsilon(self, epsilon):
        self.epsilon = epsilon

    def setLearningRate(self, alpha):
        self.alpha = alpha

    def setDiscount(self, discount):
        self.discount = discount

    def doAction(self, state, action):
        """
          Called by the inheriting class when
          an action is taken in a state.
        """
        self.lastState = state
        self.lastAction = action

    def observationFunction(self, state):
        """
          This is where we ended up after our last action.
          The simulation should somehow ensure this is called.
        """
        if self.lastState is not None:
            reward = state.getScore() - self.lastState.getScore()
            self.observeTransition(self.lastState, self.lastAction, state, reward)
        return state

    def registerInitialState(self, state):
        self.startEpisode()
        if self.episodesSoFar == 0:
            print 'Beginning %d episodes of Training' % (self.numTraining)

    def final(self, state):
        """
          Called by the Pacman game at the terminal state.
        """
        deltaReward = state.getScore() - self.lastState.getScore()
        self.observeTransition(self.lastState, self.lastAction, state, deltaReward)
        self.stopEpisode()

        # Make sure the bookkeeping variables exist.
        if 'episodeStartTime' not in self.__dict__:
            self.episodeStartTime = time.time()
        if 'lastWindowAccumRewards' not in self.__dict__:
            self.lastWindowAccumRewards = 0.0
        self.lastWindowAccumRewards += state.getScore()

        NUM_EPS_UPDATE = 100
        if self.episodesSoFar % NUM_EPS_UPDATE == 0:
            print 'Reinforcement Learning Status:'
            windowAvg = self.lastWindowAccumRewards / float(NUM_EPS_UPDATE)
            if self.episodesSoFar <= self.numTraining:
                trainAvg = self.accumTrainRewards / float(self.episodesSoFar)
                print '\tCompleted %d out of %d training episodes' % (
                    self.episodesSoFar, self.numTraining)
                print '\tAverage Rewards over all training: %.2f' % (trainAvg)
            else:
                testAvg = float(self.accumTestRewards) / (self.episodesSoFar - self.numTraining)
                print '\tCompleted %d test episodes' % (self.episodesSoFar - self.numTraining)
                print '\tAverage Rewards over testing: %.2f' % (testAvg)
            print '\tAverage Rewards for last %d episodes: %.2f' % (
                NUM_EPS_UPDATE, windowAvg)
            print '\tEpisode took %.2f seconds' % (time.time() - self.episodeStartTime)
            self.lastWindowAccumRewards = 0.0
            self.episodeStartTime = time.time()

        if self.episodesSoFar == self.numTraining:
            msg = 'Training Done (turning off epsilon and alpha)'
            print '%s\n%s' % (msg, '-' * len(msg))