|
@@ -224,118 +224,7 @@ void remove_generator(monoid &__restrict__ dst,
|
|
// Shift block by decal uchar
|
|
// Shift block by decal uchar
|
|
block = shuffle_epi8(src.blocks[0], shift16[decal]);
|
|
block = shuffle_epi8(src.blocks[0], shift16[decal]);
|
|
dst.blocks[start_block] -= ((block != zero) & block1);
|
|
dst.blocks[start_block] -= ((block != zero) & block1);
|
|
-#if NBLOCKS >= 5
|
|
|
|
|
|
|
|
-#define CASE_UNROLL(i_loop) \
|
|
|
|
- case i_loop : \
|
|
|
|
- dst.blocks[i_loop+1] -= (load_unaligned_epi8(srcblock) != zero) & block1; \
|
|
|
|
- srcblock += sizeof(epi8);
|
|
|
|
-
|
|
|
|
- {
|
|
|
|
- const uint8_t *srcblock = src.decs + sizeof(epi8) - decal;
|
|
|
|
- switch(start_block)
|
|
|
|
- {
|
|
|
|
- CASE_UNROLL(0);
|
|
|
|
-#if NBLOCKS > 2
|
|
|
|
- CASE_UNROLL(1);
|
|
|
|
-#endif
|
|
|
|
-#if NBLOCKS > 3
|
|
|
|
- CASE_UNROLL(2);
|
|
|
|
-#endif
|
|
|
|
-#if NBLOCKS > 4
|
|
|
|
- CASE_UNROLL(3);
|
|
|
|
-#endif
|
|
|
|
-#if NBLOCKS > 5
|
|
|
|
- CASE_UNROLL(4);
|
|
|
|
-#endif
|
|
|
|
-#if NBLOCKS > 6
|
|
|
|
- CASE_UNROLL(5);
|
|
|
|
-#endif
|
|
|
|
-#if NBLOCKS > 7
|
|
|
|
- CASE_UNROLL(6);
|
|
|
|
-#endif
|
|
|
|
-#if NBLOCKS > 8
|
|
|
|
- CASE_UNROLL(7);
|
|
|
|
-#endif
|
|
|
|
-#if NBLOCKS > 9
|
|
|
|
- CASE_UNROLL(8);
|
|
|
|
-#endif
|
|
|
|
-#if NBLOCKS > 10
|
|
|
|
- CASE_UNROLL(9);
|
|
|
|
-#endif
|
|
|
|
-#if NBLOCKS > 11
|
|
|
|
- CASE_UNROLL(10);
|
|
|
|
-#endif
|
|
|
|
-#if NBLOCKS > 12
|
|
|
|
- CASE_UNROLL(11);
|
|
|
|
-#endif
|
|
|
|
-#if NBLOCKS > 13
|
|
|
|
- CASE_UNROLL(12);
|
|
|
|
-#endif
|
|
|
|
-#if NBLOCKS > 14
|
|
|
|
- CASE_UNROLL(13);
|
|
|
|
-#endif
|
|
|
|
-#if NBLOCKS > 15
|
|
|
|
- CASE_UNROLL(14);
|
|
|
|
-#endif
|
|
|
|
-#if NBLOCKS > 16
|
|
|
|
- CASE_UNROLL(15);
|
|
|
|
-#endif
|
|
|
|
-#if NBLOCKS > 17
|
|
|
|
- CASE_UNROLL(16);
|
|
|
|
-#endif
|
|
|
|
-#if NBLOCKS > 18
|
|
|
|
- CASE_UNROLL(17);
|
|
|
|
-#endif
|
|
|
|
-#if NBLOCKS > 19
|
|
|
|
- CASE_UNROLL(18);
|
|
|
|
-#endif
|
|
|
|
-#if NBLOCKS > 20
|
|
|
|
- CASE_UNROLL(19);
|
|
|
|
-#endif
|
|
|
|
-#if NBLOCKS > 21
|
|
|
|
- CASE_UNROLL(20);
|
|
|
|
-#endif
|
|
|
|
-#if NBLOCKS > 22
|
|
|
|
- CASE_UNROLL(21);
|
|
|
|
-#endif
|
|
|
|
-#if NBLOCKS > 23
|
|
|
|
- CASE_UNROLL(22);
|
|
|
|
-#endif
|
|
|
|
-#if NBLOCKS > 24
|
|
|
|
- CASE_UNROLL(23);
|
|
|
|
-#endif
|
|
|
|
-#if NBLOCKS > 25
|
|
|
|
- CASE_UNROLL(24);
|
|
|
|
-#endif
|
|
|
|
-#if NBLOCKS > 26
|
|
|
|
- CASE_UNROLL(25);
|
|
|
|
-#endif
|
|
|
|
-#if NBLOCKS > 27
|
|
|
|
- CASE_UNROLL(26);
|
|
|
|
-#endif
|
|
|
|
-#if NBLOCKS > 28
|
|
|
|
- CASE_UNROLL(27);
|
|
|
|
-#endif
|
|
|
|
-#if NBLOCKS > 29
|
|
|
|
- CASE_UNROLL(28);
|
|
|
|
-#endif
|
|
|
|
-#if NBLOCKS > 30
|
|
|
|
- CASE_UNROLL(29);
|
|
|
|
-#endif
|
|
|
|
-#if NBLOCKS > 31
|
|
|
|
- CASE_UNROLL(30);
|
|
|
|
-#endif
|
|
|
|
-#if NBLOCKS > 32
|
|
|
|
- CASE_UNROLL(31);
|
|
|
|
-#endif
|
|
|
|
-#if NBLOCKS > 33
|
|
|
|
-#error "Too many blocks"
|
|
|
|
-#endif
|
|
|
|
- }
|
|
|
|
- }
|
|
|
|
-#else
|
|
|
|
-#warning "Loop not unrolled"
|
|
|
|
|
|
|
|
for (auto i=start_block+1; i<NBLOCKS; i++)
|
|
for (auto i=start_block+1; i<NBLOCKS; i++)
|
|
{
|
|
{
|
|
@@ -344,7 +233,7 @@ void remove_generator(monoid &__restrict__ dst,
|
|
block = load_unaligned_epi8(src.decs + ((i-start_block)<<4) - decal);
|
|
block = load_unaligned_epi8(src.decs + ((i-start_block)<<4) - decal);
|
|
dst.blocks[i] -= ((block != zero) & block1);
|
|
dst.blocks[i] -= ((block != zero) & block1);
|
|
}
|
|
}
|
|
-#endif
|
|
|
|
|
|
+
|
|
|
|
|
|
assert(dst.decs[dst.conductor-1] == 0);
|
|
assert(dst.decs[dst.conductor-1] == 0);
|
|
}
|
|
}
|