"""Reinforcement learning policy classes implementations for Operator Selection Strategy
"""
# main imports
import logging
import random
import math
import numpy as np

# module imports
from .base import Policy


class UCBPolicy(Policy):
    """Upper Confidence Bound (UCB) policy class which is used for applying UCB strategy when selecting and applying operator 

    Rather than performing exploration by simply selecting an arbitrary action, chosen with a probability that remains constant, 
    the UCB algorithm changes its exploration-exploitation balance as it gathers more knowledge of the environment. 
    It moves from being primarily focused on exploration, when actions that have been tried the least are preferred, 
    to instead concentrate on exploitation, selecting the action with the highest estimated reward.

    Link: https://banditalgs.com/2016/09/18/the-upper-confidence-bound-algorithm/
    
    Attributes:
        operators: {[Operator]} -- list of selected operators for the algorithm
        C: {float} -- tradeoff between EvE parameter for UCB
        exp_rate: {float} -- exploration rate (probability to choose randomly next operator)
        rewards: {[float]} -- list of summed rewards obtained for each operator
        occurrences: {[int]} -- number of use (selected) of each operator
    """
    def __init__(self, operators, C=100., exp_rate=0.5):
        self._operators = operators
        self._rewards = [0. for o in self._operators]
        self._occurrences = [0 for o in self._operators]
        self._C = C
        self._exp_rate = exp_rate

    def select(self):
        """Select randomly the next operator to use

        Returns:
            {Operator}: the selected operator
        """

        indices = [i for i, o in enumerate(self._occurrences) if o == 0]

        # random choice following exploration rate
        if np.random.uniform(0, 1) <= self._exp_rate:

            index = random.choice(range(len(self._operators)))
            return self._operators[index]

        elif len(indices) == 0:

            # if operator have at least be used one time
            ucbValues = []
            nVisits = sum(self._occurrences)

            for i in range(len(self._operators)):

                ucbValue = self._rewards[i] + self._C * math.sqrt(
                    math.log(nVisits) / (self._occurrences[i] + 0.1))
                ucbValues.append(ucbValue)

            return self._operators[ucbValues.index(max(ucbValues))]

        else:
            return self._operators[random.choice(indices)]

    def apply(self, solution):
        """
        Apply specific operator chosen to create new solution, computes its fitness and returns solution
        
        Args:
            solution: {Solution} -- the solution to use for generating new solution

        Returns:
            {Solution} -- new generated solution
        """

        operator = self.select()

        logging.info("---- Applying %s on %s" %
                     (type(operator).__name__, solution))

        # apply operator on solution
        newSolution = operator.apply(solution)

        # compute fitness of new solution
        newSolution.evaluate(self._algo._evaluator)

        # compute fitness improvment rate
        if self._algo._maximise:
            fir = (newSolution.fitness() -
                   solution.fitness()) / solution.fitness()
        else:
            fir = (solution.fitness() -
                   newSolution.fitness()) / solution.fitness()

        operator_index = self._operators.index(operator)

        if fir > 0:
            self._rewards[operator_index] += fir

        self._occurrences[operator_index] += 1

        logging.info("---- Obtaining %s" % (solution))

        return newSolution