reinforcement.py 7.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183
  1. """Reinforcement learning policy classes implementations for Operator Selection Strategy
  2. """
  3. # main imports
  4. import logging
  5. import random
  6. import math
  7. import numpy as np
  8. # module imports
  9. from macop.policies.base import Policy
  10. from macop.operators.base import KindOperator
  11. class UCBPolicy(Policy):
  12. """Upper Confidence Bound (UCB) policy class which is used for applying UCB strategy when selecting and applying operator
  13. Rather than performing exploration by simply selecting an arbitrary action, chosen with a probability that remains constant,
  14. the UCB algorithm changes its exploration-exploitation balance as it gathers more knowledge of the environment.
  15. It moves from being primarily focused on exploration, when actions that have been tried the least are preferred,
  16. to instead concentrate on exploitation, selecting the action with the highest estimated reward.
  17. - Resource link: https://banditalgs.com/2016/09/18/the-upper-confidence-bound-algorithm/
  18. Attributes:
  19. operators: {[Operator]} -- list of selected operators for the algorithm
  20. C: {float} -- The second half of the UCB equation adds exploration, with the degree of exploration being controlled by the hyper-parameter ``C``.
  21. exp_rate: {float} -- exploration rate (probability to choose randomly next operator)
  22. rewards: {[float]} -- list of summed rewards obtained for each operator
  23. occurrences: {[int]} -- number of use (selected) of each operator
  24. The value of attribute ``C`` will allow us to specify whether we wish to exploit or explore further in relation to our earned rewards.
  25. A low value of ``C`` (e.g. 2) will allow more exploitation, while a high value of ``C`` (e.g. 1000) will allow exploration.
  26. The ``exp_rate`` variable avoids using an operator too much and allows to explore from time to time (especially if the variable ``C`` has a small value). Typical value for ``exp_rate`` can be 0.9.
  27. Example:
  28. >>> # operators import
  29. >>> from macop.operators.discrete.crossovers import SimpleCrossover
  30. >>> from macop.operators.discrete.mutators import SimpleMutation
  31. >>>
  32. >>> # policy import
  33. >>> from macop.policies.reinforcement import UCBPolicy
  34. >>>
  35. >>> # solution and algorithm
  36. >>> from macop.solutions.discrete import BinarySolution
  37. >>> from macop.algorithms.mono import IteratedLocalSearch
  38. >>> from macop.algorithms.mono import HillClimberFirstImprovment
  39. >>>
  40. >>> # evaluator import
  41. >>> from macop.evaluators.discrete.mono import KnapsackEvaluator
  42. >>> # evaluator initialization (worths objects passed into data)
  43. >>>
  44. >>> worths = [ random.randint(0, 20) for i in range(20) ]
  45. >>> evaluator = KnapsackEvaluator(data={'worths': worths})
  46. >>>
  47. >>> # validator specification (based on weights of each objects)
  48. >>> weights = [ random.randint(5, 30) for i in range(20) ]
  49. >>> validator = lambda solution: True if sum([weights[i] for i, value in enumerate(solution.getData()) if value == 1]) < 200 else False
  50. >>>
  51. >>> # initialiser function with lambda function
  52. >>> initialiser = lambda x=20: BinarySolution.random(x, validator)
  53. >>>
  54. >>> # operators list with crossover and mutation
  55. >>> operators = [SimpleCrossover(), SimpleMutation()]
  56. >>> policy = UCBPolicy(operators)
  57. >>> local_search = HillClimberFirstImprovment(initialiser, evaluator, operators, policy, validator, maximise=True, verbose=False)
  58. >>> algo = IteratedLocalSearch(initialiser, evaluator, operators, policy, validator, localSearch=local_search, maximise=True, verbose=False)
  59. >>> policy.occurences
  60. [0, 0]
  61. >>> solution = algo.run(100)
  62. >>> type(solution).__name__
  63. 'BinarySolution'
  64. >>> policy.occurences # one more due to first evaluation
  65. [51, 53]
  66. """
  67. def __init__(self, operators, C=100., exp_rate=0.9):
  68. """UCB Policy initialiser
  69. Args:
  70. operators: {[Operator]} -- list of selected operators for the algorithm
  71. C: {float} -- The second half of the UCB equation adds exploration, with the degree of exploration being controlled by the hyper-parameter `C`.
  72. exp_rate: {float} -- exploration rate (probability to choose randomly next operator)
  73. """
  74. # private members
  75. self._operators = operators
  76. self._C = C
  77. self._exp_rate = exp_rate
  78. # public members
  79. self.rewards = [0. for o in self._operators]
  80. self.occurences = [0 for o in self._operators]
  81. def select(self):
  82. """Select using Upper Confidence Bound the next operator to use (using acquired rewards)
  83. Returns:
  84. {Operator}: the selected operator
  85. """
  86. indices = [i for i, o in enumerate(self.occurences) if o == 0]
  87. # random choice following exploration rate
  88. if np.random.uniform(0, 1) <= self._exp_rate:
  89. index = random.choice(range(len(self._operators)))
  90. return self._operators[index]
  91. elif len(indices) == 0:
  92. # if operator have at least be used one time
  93. ucbValues = []
  94. nVisits = sum(self.occurences)
  95. for i in range(len(self._operators)):
  96. ucbValue = self.rewards[i] + self._C * math.sqrt(
  97. math.log(nVisits) / (self.occurences[i] + 0.1))
  98. ucbValues.append(ucbValue)
  99. return self._operators[ucbValues.index(max(ucbValues))]
  100. else:
  101. return self._operators[random.choice(indices)]
  102. def apply(self, solution1, solution2=None):
  103. """
  104. Apply specific operator chosen to create new solution, computes its fitness and returns solution
  105. - fitness improvment is saved as rewards
  106. - selected operator occurence is also increased
  107. Args:
  108. solution1: {Solution} -- the first solution to use for generating new solution
  109. solution2: {Solution} -- the second solution to use for generating new solution (in case of specific crossover, default is best solution from algorithm)
  110. Returns:
  111. {Solution} -- new generated solution
  112. """
  113. operator = self.select()
  114. logging.info("---- Applying %s on %s" %
  115. (type(operator).__name__, solution1))
  116. # default value of solution2 is current best solution
  117. if solution2 is None and self._algo is not None:
  118. solution2 = self._algo.getResult()
  119. # avoid use of crossover if only one solution is passed
  120. if solution2 is None and operator._kind == KindOperator.CROSSOVER:
  121. while operator._kind == KindOperator.CROSSOVER:
  122. operator = self.select()
  123. # apply operator on solution
  124. if operator._kind == KindOperator.CROSSOVER:
  125. newSolution = operator.apply(solution1, solution2)
  126. else:
  127. newSolution = operator.apply(solution1)
  128. # compute fitness of new solution
  129. newSolution.evaluate(self._algo.evaluator)
  130. # compute fitness improvment rate
  131. if self._algo._maximise:
  132. fir = (newSolution.fitness() -
  133. solution1.fitness()) / solution1.fitness()
  134. else:
  135. fir = (solution1.fitness() -
  136. newSolution.fitness()) / solution1.fitness()
  137. operator_index = self._operators.index(operator)
  138. if fir > 0:
  139. self.rewards[operator_index] += fir
  140. self.occurences[operator_index] += 1
  141. logging.info("---- Obtaining %s" % (newSolution))
  142. return newSolution