monoid.hpp 8.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293
  1. #ifndef MONOID_HPP
  2. #define MONOID_HPP
  3. #include <cstdint>
  4. #include <cmath>
  5. // We cant have those as C++ constant because of the #define used below in
  6. // remove_generator manual loop unrolling. I don't know if the unrolling is
  7. // doable using template metaprogamming. I didn't manage to figure out how.
  8. #ifndef MAX_GENUS
  9. #error "Please define the MAX_GENUS macro"
  10. #endif
  11. #define SIZE_BOUND (3*(MAX_GENUS-1))
  12. #define NBLOCKS ((SIZE_BOUND+15) >> 4)
  13. #define SIZE (NBLOCKS << 4)
  14. // const uint_fast8_t MAX_GENUS = 40;
  15. // const uint_fast8_t SIZE_BOUND = (3*(MAX_GENUS-1));
  16. // const uint_fast8_t NBLOCKS = ((SIZE_BOUND+15) >> 4);
  17. // const uint_fast8_t SIZE = (NBLOCKS << 4);
  18. #include <x86intrin.h>
  19. typedef uint8_t epi8 __attribute__ ((vector_size (16)));
  20. typedef uint8_t dec_numbers[SIZE] __attribute__ ((aligned (16)));
  21. typedef epi8 dec_blocks[NBLOCKS];
  22. typedef uint_fast64_t ind_t; // The type used for array indexes
  23. struct monoid
  24. {
  25. union {
  26. dec_numbers decs;
  27. dec_blocks blocks;
  28. };
  29. // Dont use char as they have to be promoted to 64 bits to do pointer arithmetic.
  30. ind_t conductor, min, genus;
  31. int quotient() const {return ceil(float(conductor)/float(min));};
  32. };
  33. void init_full_N(monoid &);
  34. void print_monoid(const monoid &);
  35. void print_epi8(epi8);
  36. inline void copy_blocks( dec_blocks &__restrict__ dst,
  37. const dec_blocks &__restrict__ src);
  38. inline void remove_generator(monoid &__restrict__ dst,
  39. const monoid &__restrict__ src,
  40. ind_t gen);
  41. inline monoid remove_generator(const monoid &src, ind_t gen);
  42. // typedef enum { ALL, CHILDREN } generator_type;
  43. class ALL {};
  44. class CHILDREN {};
  45. // template <generator_type T> class generator_iter
  46. template <class T> class generator_iter
  47. {
  48. private:
  49. const monoid &m;
  50. unsigned int mask; // movemask_epi8 returns a 32 bits values
  51. ind_t iblock, gen, bound;
  52. public:
  53. generator_iter(const monoid &mon);
  54. bool move_next();
  55. uint8_t count();
  56. inline ind_t get_gen() const {return gen; };
  57. };
  58. ///////////////////////////////////////////////////////////////////////////
  59. ///////////////////////////////////////////////////////////////////////////
  60. //////////////// Implementation part here for inlining ////////////////
  61. ///////////////////////////////////////////////////////////////////////////
  62. ///////////////////////////////////////////////////////////////////////////
  63. /*
  64. Note: for some reason the code using gcc vector variables is 2-3% faster than
  65. the code using gcc intrinsic instructions.
  66. Here are the results of two runs:
  67. data_mmx = [9.757, 9.774, 9.757, 9.761, 9.811, 9.819, 9.765, 9.888, 9.774, 9.771]
  68. data_epi8 = [9.592, 9.535, 9.657, 9.468, 9.460, 9.679, 9.458, 9.461, 9.629, 9.474]
  69. */
  70. extern inline epi8 load_unaligned_epi8(const uint8_t *t)
  71. { return (epi8) _mm_loadu_si128((__m128i *) (t)); };
  72. extern inline epi8 shuffle_epi8(epi8 b, epi8 sh) // Require SSE 3
  73. { return (epi8) _mm_shuffle_epi8((__m128i) b, (__m128i) sh);}
  74. extern inline int movemask_epi8(epi8 b)
  75. { return _mm_movemask_epi8((__m128i) b);};
  76. const epi8 zero = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
  77. const epi8 block1 = zero + 1;
  78. const uint8_t m1 = 255;
  79. const epi8 shift16[16] =
  80. { { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15},
  81. {m1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14},
  82. {m1,m1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13},
  83. {m1,m1,m1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12},
  84. {m1,m1,m1,m1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11},
  85. {m1,m1,m1,m1,m1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10},
  86. {m1,m1,m1,m1,m1,m1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9},
  87. {m1,m1,m1,m1,m1,m1,m1, 0, 1, 2, 3, 4, 5, 6, 7, 8},
  88. {m1,m1,m1,m1,m1,m1,m1,m1, 0, 1, 2, 3, 4, 5, 6, 7},
  89. {m1,m1,m1,m1,m1,m1,m1,m1,m1, 0, 1, 2, 3, 4, 5, 6},
  90. {m1,m1,m1,m1,m1,m1,m1,m1,m1,m1, 0, 1, 2, 3, 4, 5},
  91. {m1,m1,m1,m1,m1,m1,m1,m1,m1,m1,m1, 0, 1, 2, 3, 4},
  92. {m1,m1,m1,m1,m1,m1,m1,m1,m1,m1,m1,m1, 0, 1, 2, 3},
  93. {m1,m1,m1,m1,m1,m1,m1,m1,m1,m1,m1,m1,m1, 0, 1, 2},
  94. {m1,m1,m1,m1,m1,m1,m1,m1,m1,m1,m1,m1,m1,m1, 0, 1},
  95. {m1,m1,m1,m1,m1,m1,m1,m1,m1,m1,m1,m1,m1,m1,m1, 0} };
  96. const epi8 mask16[16] =
  97. { {m1,m1,m1,m1,m1,m1,m1,m1,m1,m1,m1,m1,m1,m1,m1,m1},
  98. { 0,m1,m1,m1,m1,m1,m1,m1,m1,m1,m1,m1,m1,m1,m1,m1},
  99. { 0, 0,m1,m1,m1,m1,m1,m1,m1,m1,m1,m1,m1,m1,m1,m1},
  100. { 0, 0, 0,m1,m1,m1,m1,m1,m1,m1,m1,m1,m1,m1,m1,m1},
  101. { 0, 0, 0, 0,m1,m1,m1,m1,m1,m1,m1,m1,m1,m1,m1,m1},
  102. { 0, 0, 0, 0, 0,m1,m1,m1,m1,m1,m1,m1,m1,m1,m1,m1},
  103. { 0, 0, 0, 0, 0, 0,m1,m1,m1,m1,m1,m1,m1,m1,m1,m1},
  104. { 0, 0, 0, 0, 0, 0, 0,m1,m1,m1,m1,m1,m1,m1,m1,m1},
  105. { 0, 0, 0, 0, 0, 0, 0, 0,m1,m1,m1,m1,m1,m1,m1,m1},
  106. { 0, 0, 0, 0, 0, 0, 0, 0, 0,m1,m1,m1,m1,m1,m1,m1},
  107. { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,m1,m1,m1,m1,m1,m1},
  108. { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,m1,m1,m1,m1,m1},
  109. { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,m1,m1,m1,m1},
  110. { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,m1,m1,m1},
  111. { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,m1,m1},
  112. { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,m1} };
  113. template<> inline generator_iter<ALL>::generator_iter(const monoid &mon)
  114. : m(mon), bound((mon.conductor+mon.min+15) >> 4)
  115. {
  116. epi8 block;
  117. iblock = 0;
  118. block = m.blocks[0];
  119. mask = movemask_epi8(block == block1);
  120. mask &= 0xFFFE; // 0 is not a generator
  121. gen = - 1;
  122. };
  123. template<> inline generator_iter<CHILDREN>::generator_iter(const monoid &mon)
  124. : m(mon), bound((mon.conductor+mon.min+15) >> 4)
  125. {
  126. epi8 block;
  127. iblock = m.conductor >> 4;
  128. block = m.blocks[iblock] & mask16[m.conductor & 0xF];
  129. mask = movemask_epi8(block == block1);
  130. gen = (iblock << 4) - 1;
  131. };
  132. template <class T> inline uint8_t generator_iter<T>::count()
  133. {
  134. uint8_t nbr = _mm_popcnt_u32(mask); // popcnt returns a 8 bits value
  135. for (ind_t ib = iblock+1; ib < bound; ib++)
  136. nbr += _mm_popcnt_u32(movemask_epi8(m.blocks[ib] == block1));
  137. return nbr;
  138. };
  139. template <class T> inline bool generator_iter<T>::move_next()
  140. {
  141. while (!mask)
  142. {
  143. iblock++;
  144. if (iblock > bound) return false;
  145. gen = (iblock << 4) - 1;
  146. mask = movemask_epi8(m.blocks[iblock] == block1);
  147. }
  148. unsigned char shift = __bsfd (mask) + 1; // Bit Scan Forward
  149. gen += shift;
  150. mask >>= shift;
  151. return true;
  152. };
  153. inline __attribute__((always_inline))
  154. void copy_blocks(dec_blocks &dst, dec_blocks const &src)
  155. {
  156. for (ind_t i=0; i<NBLOCKS; i++) dst[i] = src[i];
  157. }
  158. #include <cassert>
  159. inline __attribute__((always_inline))
  160. void remove_generator(monoid &__restrict__ dst,
  161. const monoid &__restrict__ src,
  162. ind_t gen)
  163. {
  164. ind_t start_block, decal;
  165. epi8 block;
  166. assert(src.decs[gen] == 1);
  167. dst.conductor = gen + 1;
  168. dst.genus = src.genus + 1;
  169. dst.min = (gen == src.min) ? dst.conductor : src.min;
  170. copy_blocks(dst.blocks, src.blocks);
  171. start_block = gen >> 4;
  172. decal = gen & 0xF;
  173. // Shift block by decal uchar
  174. block = shuffle_epi8(src.blocks[0], shift16[decal]);
  175. dst.blocks[start_block] -= ((block != zero) & block1);
  176. #if NBLOCKS >= 5
  177. #define CASE_UNROLL(i_loop) \
  178. case i_loop : \
  179. dst.blocks[i_loop+1] -= (load_unaligned_epi8(srcblock) != zero) & block1; \
  180. srcblock += sizeof(epi8);
  181. {
  182. const uint8_t *srcblock = src.decs + sizeof(epi8) - decal;
  183. switch(start_block)
  184. {
  185. CASE_UNROLL(0);
  186. #if NBLOCKS > 2
  187. CASE_UNROLL(1);
  188. #endif
  189. #if NBLOCKS > 3
  190. CASE_UNROLL(2);
  191. #endif
  192. #if NBLOCKS > 4
  193. CASE_UNROLL(3);
  194. #endif
  195. #if NBLOCKS > 5
  196. CASE_UNROLL(4);
  197. #endif
  198. #if NBLOCKS > 6
  199. CASE_UNROLL(5);
  200. #endif
  201. #if NBLOCKS > 7
  202. CASE_UNROLL(6);
  203. #endif
  204. #if NBLOCKS > 8
  205. CASE_UNROLL(7);
  206. #endif
  207. #if NBLOCKS > 9
  208. CASE_UNROLL(8);
  209. #endif
  210. #if NBLOCKS > 10
  211. CASE_UNROLL(9);
  212. #endif
  213. #if NBLOCKS > 11
  214. CASE_UNROLL(10);
  215. #endif
  216. #if NBLOCKS > 12
  217. CASE_UNROLL(11);
  218. #endif
  219. #if NBLOCKS > 13
  220. CASE_UNROLL(12);
  221. #endif
  222. #if NBLOCKS > 14
  223. CASE_UNROLL(13);
  224. #endif
  225. #if NBLOCKS > 15
  226. CASE_UNROLL(14);
  227. #endif
  228. #if NBLOCKS > 16
  229. #error "Too many blocks"
  230. #endif
  231. }
  232. }
  233. #else
  234. #warning "Loop not unrolled"
  235. for (auto i=start_block+1; i<NBLOCKS; i++)
  236. {
  237. // The following won't work due to some alignment problem (bug in GCC 4.7.1?)
  238. // block = *((epi8*)(src.decs + ((i-start_block)<<4) - decal));
  239. block = load_unaligned_epi8(src.decs + ((i-start_block)<<4) - decal);
  240. dst.blocks[i] -= ((block != zero) & block1);
  241. }
  242. #endif
  243. assert(dst.decs[dst.conductor-1] == 0);
  244. }
  245. inline monoid remove_generator(const monoid &src, ind_t gen)
  246. {
  247. monoid dst;
  248. remove_generator(dst, src, gen);
  249. return dst;
  250. }
  251. #endif