  1. """Reinforcement learning policy classes implementations for Operator Selection Strategy
  2. """
  3. # main imports
  4. import logging
  5. import random
  6. import math
  7. import numpy as np
  8. # module imports
  9. from .base import Policy
class UCBPolicy(Policy):
    """Upper Confidence Bound (UCB) policy class, used to apply the UCB strategy when selecting and applying an operator

    Rather than performing exploration by simply selecting an arbitrary action with a probability that remains constant,
    the UCB algorithm changes its exploration-exploitation balance as it gathers more knowledge of the environment.
    It moves from being primarily focused on exploration, when actions that have been tried the least are preferred,
    to instead concentrating on exploitation, selecting the action with the highest estimated reward.

    - Resource link: https://banditalgs.com/2016/09/18/the-upper-confidence-bound-algorithm/
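
    The UCB score implemented in the ``select`` method below is:

        reward_i + C * sqrt(log(N) / (occurences_i + 0.1))

    where ``reward_i`` is the summed reward of operator ``i``, ``occurences_i`` its number of selections
    and ``N`` the total number of selections over all operators.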

    Attributes:
        operators: {[Operator]} -- list of selected operators for the algorithm
        C: {float} -- the second half of the UCB equation adds exploration, with the degree of exploration being controlled by the hyper-parameter `C`
        exp_rate: {float} -- exploration rate (probability to choose the next operator randomly)
        rewards: {[float]} -- list of summed rewards obtained for each operator
        occurrences: {[int]} -- number of times each operator has been selected

    Example:

    >>> # operators import
    >>> from macop.operators.discrete.crossovers import SimpleCrossover
    >>> from macop.operators.discrete.mutators import SimpleMutation
    >>> # policy import
    >>> from macop.policies.reinforcement import UCBPolicy
    >>> # solution and algorithm imports
    >>> from macop.solutions.discrete import BinarySolution
    >>> from macop.algorithms.mono import IteratedLocalSearch
    >>> from macop.algorithms.mono import HillClimberFirstImprovment
    >>> # evaluator import
    >>> from macop.evaluators.discrete.mono import KnapsackEvaluator
    >>> # evaluator initialization (worths of objects passed into data)
    >>> worths = [ random.randint(0, 20) for i in range(20) ]
    >>> evaluator = KnapsackEvaluator(data={'worths': worths})
    >>> # validator specification (based on the weight of each object)
    >>> weights = [ random.randint(5, 30) for i in range(20) ]
    >>> validator = lambda solution: True if sum([weights[i] for i, value in enumerate(solution._data) if value == 1]) < 200 else False
    >>> # initializer function with lambda function
    >>> initializer = lambda x=20: BinarySolution.random(x, validator)
    >>> # operators list with crossover and mutation
    >>> operators = [SimpleCrossover(), SimpleMutation()]
    >>> policy = UCBPolicy(operators)
    >>> local_search = HillClimberFirstImprovment(initializer, evaluator, operators, policy, validator, maximise=True, verbose=False)
    >>> algo = IteratedLocalSearch(initializer, evaluator, operators, policy, validator, localSearch=local_search, maximise=True, verbose=False)
    >>> policy._occurences
    [0, 0]
    >>> solution = algo.run(100)
    >>> type(solution).__name__
    'BinarySolution'
    >>> policy._occurences # one more due to first evaluation
    [51, 53]
    """
    def __init__(self, operators, C=100., exp_rate=0.5):
        self._operators = operators
        self._rewards = [0. for o in self._operators]
        self._occurences = [0 for o in self._operators]
        self._C = C
        self._exp_rate = exp_rate

    def select(self):
        """Select the next operator to use with the Upper Confidence Bound strategy (based on acquired rewards)

        Returns:
            {Operator}: the selected operator
        """
        indices = [i for i, o in enumerate(self._occurences) if o == 0]

        # random choice following exploration rate
        if np.random.uniform(0, 1) <= self._exp_rate:
            index = random.choice(range(len(self._operators)))
            return self._operators[index]
        elif len(indices) == 0:
            # every operator has been used at least once
            ucbValues = []
            nVisits = sum(self._occurences)

            for i in range(len(self._operators)):
                ucbValue = self._rewards[i] + self._C * math.sqrt(
                    math.log(nVisits) / (self._occurences[i] + 0.1))
                ucbValues.append(ucbValue)

            return self._operators[ucbValues.index(max(ucbValues))]
        else:
            # at least one operator has never been used: try one of those first
            return self._operators[random.choice(indices)]

    def apply(self, solution):
        """
        Apply the chosen operator to create a new solution, compute its fitness and return it

        - the fitness improvement is saved as the reward of the selected operator
        - the occurrence count of the selected operator is also increased

        Args:
            solution: {Solution} -- the solution to use for generating the new solution

        Returns:
            {Solution} -- the new generated solution
        """
        operator = self.select()

        logging.info("---- Applying %s on %s" %
                     (type(operator).__name__, solution))

        # apply operator on solution
        newSolution = operator.apply(solution)

        # compute fitness of new solution
        newSolution.evaluate(self._algo._evaluator)

        # compute fitness improvement rate
        if self._algo._maximise:
            fir = (newSolution.fitness() -
                   solution.fitness()) / solution.fitness()
        else:
            fir = (solution.fitness() -
                   newSolution.fitness()) / solution.fitness()

        operator_index = self._operators.index(operator)

        # only improving moves are rewarded, but every selection is counted
        if fir > 0:
            self._rewards[operator_index] += fir

        self._occurences[operator_index] += 1

        logging.info("---- Obtaining %s" % (solution))

        return newSolution
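

# A minimal, self-contained sketch (not part of the macop API) of how the UCB score above
# behaves: operator bookkeeping is reduced to plain lists with assumed values so the snippet
# runs without the rest of the library.
if __name__ == "__main__":
    rewards = [2.5, 0.4]        # hypothetical summed rewards per operator
    occurences = [40, 5]        # hypothetical selection counts per operator
    C = 100.
    nVisits = sum(occurences)

    ucbValues = [
        rewards[i] + C * math.sqrt(math.log(nVisits) / (occurences[i] + 0.1))
        for i in range(len(rewards))
    ]
    # the rarely used operator gets a large exploration bonus, so it is selected here
    print(ucbValues, ucbValues.index(max(ucbValues)))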