reinforcement.py

  1. """Reinforcement learning policy classes implementations for Operator Selection Strategy
  2. """
  3. # main imports
  4. import logging
  5. import random
  6. import math
  7. import numpy as np
  8. # module imports
  9. from .base import Policy


class UCBPolicy(Policy):
    """Upper Confidence Bound (UCB) policy class, used to apply the UCB strategy when selecting and applying operators

    Rather than performing exploration by simply selecting an arbitrary action, chosen with a probability that remains constant,
    the UCB algorithm changes its exploration-exploitation balance as it gathers more knowledge of the environment.
    It moves from being primarily focused on exploration, when actions that have been tried the least are preferred,
    to concentrating instead on exploitation, selecting the action with the highest estimated reward.

    Link: https://banditalgs.com/2016/09/18/the-upper-confidence-bound-algorithm/

    Attributes:
        operators: {[Operator]} -- list of selected operators for the algorithm
        C: {float} -- exploration-exploitation (EvE) trade-off parameter for UCB
        exp_rate: {float} -- exploration rate (probability of choosing the next operator at random)
        rewards: {[float]} -- list of summed rewards obtained for each operator
        occurrences: {[int]} -- number of times each operator has been selected
    """

    def __init__(self, operators, C=100., exp_rate=0.5):
        self._operators = operators
        self._rewards = [0. for _ in self._operators]
        self._occurrences = [0 for _ in self._operators]
        self._C = C
        self._exp_rate = exp_rate
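
    # Selection rule, as implemented in select() below: with probability
    # `exp_rate` an operator is drawn uniformly at random; otherwise an
    # operator that has never been used is picked at random, and once every
    # operator has been tried at least once, the operator maximising
    #
    #     reward_i + C * sqrt(log(N) / (n_i + 0.1))
    #
    # is chosen, where N is the total number of selections so far, n_i the
    # number of times operator i was selected, and 0.1 a small smoothing
    # constant that avoids division by zero.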

    def select(self):
        """Select the next operator to use, following the UCB policy

        Returns:
            {Operator}: the selected operator
        """
        # indices of operators that have never been selected yet
        indices = [i for i, o in enumerate(self._occurrences) if o == 0]

        # random choice following exploration rate
        if np.random.uniform(0, 1) <= self._exp_rate:
            index = random.choice(range(len(self._operators)))
            return self._operators[index]
        elif len(indices) == 0:
            # every operator has been used at least once:
            # pick the one with the highest UCB value
            ucbValues = []
            nVisits = sum(self._occurrences)

            for i in range(len(self._operators)):
                ucbValue = self._rewards[i] + self._C * math.sqrt(
                    math.log(nVisits) / (self._occurrences[i] + 0.1))
                ucbValues.append(ucbValue)

            return self._operators[ucbValues.index(max(ucbValues))]
        else:
            # otherwise, pick at random among the never-used operators
            return self._operators[random.choice(indices)]
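
    # Reward definition used by apply() below: the fitness improvement rate
    #
    #     fir = (f(new) - f(old)) / f(old)    when maximising
    #     fir = (f(old) - f(new)) / f(old)    when minimising
    #
    # Only strictly positive improvements are credited to the selected
    # operator's cumulative reward.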

    def apply(self, solution):
        """Apply the chosen operator to generate a new solution, compute its fitness and return it

        Args:
            solution: {Solution} -- the solution to use for generating a new solution

        Returns:
            {Solution} -- new generated solution
        """
        operator = self.select()

        logging.info("---- Applying %s on %s" %
                     (type(operator).__name__, solution))

        # apply operator on solution
        newSolution = operator.apply(solution)

        # compute fitness of new solution
        newSolution.evaluate(self._algo._evaluator)

        # compute fitness improvement rate
        if self._algo._maximise:
            fir = (newSolution.fitness() -
                   solution.fitness()) / solution.fitness()
        else:
            fir = (solution.fitness() -
                   newSolution.fitness()) / solution.fitness()

        operator_index = self._operators.index(operator)

        # credit only positive improvements as reward
        if fir > 0:
            self._rewards[operator_index] += fir

        self._occurrences[operator_index] += 1

        logging.info("---- Obtaining %s" % (newSolution))

        return newSolution
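

# Minimal usage sketch (illustration only; `RandomMutation` and the algorithm
# wiring are hypothetical stand-ins -- apply() only relies on the attached
# algorithm exposing `_evaluator` and `_maximise`):
#
#     operators = [RandomMutation(), RandomMutation()]
#     policy = UCBPolicy(operators, C=100., exp_rate=0.5)
#     policy._algo = algo                     # algorithm owning the policy
#     new_solution = policy.apply(current_solution)
#
# Over repeated calls, operators that keep yielding positive fitness
# improvement rates accumulate reward and are selected more often, while the
# C * sqrt(log(N) / (n_i + 0.1)) term periodically revisits the others.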