#ifndef SEMIGROUP_HPP #define SEMIGROUP_HPP #include #include // We cant have those as C++ constant because of the #define used below in // remove_generator manual loop unrolling. I don't know if the unrolling is // doable using template metaprogamming. I didn't manage to figure out how. #ifndef MAX_GENUS #error "Please define the MAX_GENUS macro" #endif #define SIZE_BOUND (3*(MAX_GENUS-1)) #define NBLOCKS ((SIZE_BOUND+15) >> 4) #define SIZE (NBLOCKS << 4) // const uint_fast8_t MAX_GENUS = 40; // const uint_fast8_t SIZE_BOUND = (3*(MAX_GENUS-1)); // const uint_fast8_t NBLOCKS = ((SIZE_BOUND+15) >> 4); // const uint_fast8_t SIZE = (NBLOCKS << 4); #include typedef uint8_t epi8 __attribute__ ((vector_size (16))); typedef uint8_t dec_numbers[SIZE] __attribute__ ((aligned (16))); typedef epi8 dec_blocks[NBLOCKS]; typedef uint_fast64_t ind_t; // The type used for array indexes using namespace std; struct Semigroup { union { dec_numbers decs; dec_blocks blocks; }; // Dont use char as they have to be promoted to 64 bits to do pointer arithmetic. ind_t conductor, min, genus,left_primitive,left,e,wilf; }; void init_full_N(Semigroup &); void init(Semigroup&,char,char,char,char*); void print_Semigroup(const Semigroup &); void print_Semigroup_gen(const Semigroup&); void print_epi8(epi8); void output(const Semigroup& m,fstream& f); void record(const Semigroup& S,fstream& f); inline void copy_blocks( dec_blocks &__restrict__ dst, const dec_blocks &__restrict__ src); inline void remove_generator(Semigroup &__restrict__ dst, const Semigroup &__restrict__ src, ind_t gen,ind_t pos); inline Semigroup remove_generator(const Semigroup &src, ind_t gen,ind_t pos); // typedef enum { ALL, CHILDREN } generator_type; class ALL {}; class CHILDREN {}; // template class generator_iter template class generator_iter{ private: const Semigroup &m; unsigned int mask; // movemask_epi8 returns a 32 bits values ind_t iblock, gen, bound; public: generator_iter(const Semigroup &mon); bool move_next(); uint8_t count(); inline ind_t get_gen() const {return gen; }; }; /////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////// //////////////// Implementation part here for inlining //////////////// /////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////// /* Note: for some reason the code using gcc vector variables is 2-3% faster than the code using gcc intrinsic instructions. Here are the results of two runs: data_mmx = [9.757, 9.774, 9.757, 9.761, 9.811, 9.819, 9.765, 9.888, 9.774, 9.771] data_epi8 = [9.592, 9.535, 9.657, 9.468, 9.460, 9.679, 9.458, 9.461, 9.629, 9.474] */ extern inline epi8 load_unaligned_epi8(const uint8_t *t) { return (epi8) _mm_loadu_si128((__m128i *) (t)); }; extern inline epi8 shuffle_epi8(epi8 b, epi8 sh) // Require SSE 3 { return (epi8) _mm_shuffle_epi8((__m128i) b, (__m128i) sh);} extern inline int movemask_epi8(epi8 b) { return _mm_movemask_epi8((__m128i) b);}; const epi8 zero = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; const epi8 block1 = zero + 1; const uint8_t m1 = 255; const epi8 shift16[16] = { { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15}, {m1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14}, {m1,m1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13}, {m1,m1,m1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12}, {m1,m1,m1,m1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11}, {m1,m1,m1,m1,m1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10}, {m1,m1,m1,m1,m1,m1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, {m1,m1,m1,m1,m1,m1,m1, 0, 1, 2, 3, 4, 5, 6, 7, 8}, {m1,m1,m1,m1,m1,m1,m1,m1, 0, 1, 2, 3, 4, 5, 6, 7}, {m1,m1,m1,m1,m1,m1,m1,m1,m1, 0, 1, 2, 3, 4, 5, 6}, {m1,m1,m1,m1,m1,m1,m1,m1,m1,m1, 0, 1, 2, 3, 4, 5}, {m1,m1,m1,m1,m1,m1,m1,m1,m1,m1,m1, 0, 1, 2, 3, 4}, {m1,m1,m1,m1,m1,m1,m1,m1,m1,m1,m1,m1, 0, 1, 2, 3}, {m1,m1,m1,m1,m1,m1,m1,m1,m1,m1,m1,m1,m1, 0, 1, 2}, {m1,m1,m1,m1,m1,m1,m1,m1,m1,m1,m1,m1,m1,m1, 0, 1}, {m1,m1,m1,m1,m1,m1,m1,m1,m1,m1,m1,m1,m1,m1,m1, 0} }; const epi8 mask16[16] = { {m1,m1,m1,m1,m1,m1,m1,m1,m1,m1,m1,m1,m1,m1,m1,m1}, { 0,m1,m1,m1,m1,m1,m1,m1,m1,m1,m1,m1,m1,m1,m1,m1}, { 0, 0,m1,m1,m1,m1,m1,m1,m1,m1,m1,m1,m1,m1,m1,m1}, { 0, 0, 0,m1,m1,m1,m1,m1,m1,m1,m1,m1,m1,m1,m1,m1}, { 0, 0, 0, 0,m1,m1,m1,m1,m1,m1,m1,m1,m1,m1,m1,m1}, { 0, 0, 0, 0, 0,m1,m1,m1,m1,m1,m1,m1,m1,m1,m1,m1}, { 0, 0, 0, 0, 0, 0,m1,m1,m1,m1,m1,m1,m1,m1,m1,m1}, { 0, 0, 0, 0, 0, 0, 0,m1,m1,m1,m1,m1,m1,m1,m1,m1}, { 0, 0, 0, 0, 0, 0, 0, 0,m1,m1,m1,m1,m1,m1,m1,m1}, { 0, 0, 0, 0, 0, 0, 0, 0, 0,m1,m1,m1,m1,m1,m1,m1}, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,m1,m1,m1,m1,m1,m1}, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,m1,m1,m1,m1,m1}, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,m1,m1,m1,m1}, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,m1,m1,m1}, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,m1,m1}, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,m1} }; template<> inline generator_iter::generator_iter(const Semigroup &mon) : m(mon), bound((mon.conductor+mon.min+15) >> 4){ epi8 block; iblock = 0; block = m.blocks[0]; mask = movemask_epi8(block == block1); mask &= 0xFFFE; // 0 is not a generator gen = - 1; }; template<> inline generator_iter::generator_iter(const Semigroup &mon) : m(mon), bound((mon.conductor+mon.min+15) >> 4){ epi8 block; iblock = m.conductor >> 4; block = m.blocks[iblock] & mask16[m.conductor & 0xF]; mask = movemask_epi8(block == block1); gen = (iblock << 4) - 1; }; template inline uint8_t generator_iter::count(){ uint8_t nbr = _mm_popcnt_u32(mask); // popcnt returns a 8 bits value for (ind_t ib = iblock+1; ib < bound; ib++) nbr += _mm_popcnt_u32(movemask_epi8(m.blocks[ib] == block1)); return nbr; }; template inline bool generator_iter::move_next(){ while (!mask){ iblock++; if (iblock > bound) return false; gen = (iblock << 4) - 1; mask = movemask_epi8(m.blocks[iblock] == block1); } unsigned char shift = __bsfd (mask) + 1; // Bit Scan Forward gen += shift; mask >>= shift; return true; }; inline __attribute__((always_inline)) void copy_blocks(dec_blocks &dst, dec_blocks const &src){ for (ind_t i=0; i inline __attribute__((always_inline)) void remove_generator(Semigroup &__restrict__ dst, const Semigroup &__restrict__ src, ind_t gen, ind_t pos){ ind_t start_block, decal; epi8 block; assert(src.decs[gen] == 1); ind_t t=gen+1; dst.conductor = t; dst.genus = src.genus + 1; int delta; if(gen==src.min){ dst.min=gen+1; delta=1; } else{ dst.min=src.min; delta=(src.decs[gen+src.min]==2)?0:-1; } dst.e=src.e+delta; assert(dst.e==((gen==src.min)?src.e+1:((src.decs[gen+src.min]==2)?src.e:src.e-1))); ind_t k=gen-src.conductor; assert(dst.conductor==src.conductor+k+1); dst.left=src.left+k; dst.left_primitive=src.left_primitive+pos; //dst.wilf=dst.e*dst.left-dst.conductor;//src.wilf+delta*(src.left+k)-k-1; dst.wilf=src.wilf+delta*(src.left+k)+(src.e-1)*k-1; copy_blocks(dst.blocks, src.blocks); start_block = gen >> 4; decal = gen & 0xF; // Shift block by decal uchar block = shuffle_epi8(src.blocks[0], shift16[decal]); dst.blocks[start_block] -= ((block != zero) & block1); for (auto i=start_block+1; i