// Inclusion check: including this header a second time is a hard error
// rather than being silently ignored. (The #endif that closes the check is
// not part of this listing.)
35#ifdef LZMA_CRC_X86_CLMUL_H
36# error crc_x86_clmul.h was included twice.
38#define LZMA_CRC_X86_CLMUL_H
44#elif defined(HAVE_CPUID_H)
// crc_attr_target: on GCC- or Clang-compatible compilers (but not EDG-based
// front ends) it expands to a function attribute that enables the SSSE3,
// SSE4.1 and PCLMUL instruction sets for the function it is applied to.
54#if (defined(__GNUC__) || defined(__clang__)) && !defined(__EDG__)
55# define crc_attr_target \
56 __attribute__((__target__("ssse3,sse4.1,pclmul")))
// Alternative definition: expands to nothing. (The #else/#endif lines that
// separate the two definitions are not part of this listing.)
58# define crc_attr_target
// MASK_L(in, mask, r): r = byte shuffle of `in` controlled by `mask`.
// _mm_shuffle_epi8 produces a zero byte wherever the control byte has its
// most significant bit set; otherwise the low four bits select a source byte.
62#define MASK_L(in, mask, r) r = _mm_shuffle_epi8(in, mask)
// MASK_H(in, mask, r): the same shuffle, but the control vector is first
// XORed with `vsign`. `vsign` is not a macro parameter: a variable with that
// name must be in scope wherever this macro is expanded.
64#define MASK_H(in, mask, r) \
65 r = _mm_shuffle_epi8(in, _mm_xor_si128(mask, vsign))
// MASK_LH(in, mask, low, high): applies MASK_L and MASK_H to the same input,
// storing the results in `low` and `high` respectively.
// NOTE: this expands to two statements without a do { } while (0) wrapper,
// so it must not be used as the unbraced body of an if/else/loop.
67#define MASK_LH(in, mask, low, high) \
68 MASK_L(in, mask, low); \
69 MASK_H(in, mask, high)
// Shared SIMD core used by both crc32_arch_optimized() and
// crc64_arch_optimized() in this file.
//
// Parameters:
//   buf, size    input bytes; buf does not need to be 16-byte aligned
//   v0, v1       output vectors; the callers continue their own final
//                reduction from the values left here
//   vfold16      constants used with carry-less multiplication to fold the
//                running state across each 16-byte step
//   initial_crc  starting CRC state supplied by the caller as a vector
//
// NOTE(review): this listing omits many original lines (the return type,
// braces, the conditions that select between the code paths, and the
// declarations of skip_end, v2 and v3). Comments below describe only the
// statements that are visible.
75crc_simd_body(
const uint8_t *
buf,
const size_t size, __m128i *v0, __m128i *v1,
76 const __m128i vfold16,
const __m128i initial_crc)
// vramp holds the byte values 0x00, 0x01, ..., 0x0f in element order.
80 const __m128i vramp = _mm_setr_epi32(
81 0x03020100, 0x07060504, 0x0b0a0908, 0x0f0e0d0c);
// vsign has 0x80 in every byte; MASK_H XORs it into its shuffle control.
86 const __m128i vsign = _mm_set1_epi8(-0x80);
// Number of bytes between the preceding 16-byte boundary and buf.
101 const size_t skip_start = (size_t)((
uintptr_t)
buf & 15);
// NOTE(review): the initializer expression is cut off in this listing; the
// later _mm_load_si128() calls require aligned_buf to be 16-byte aligned.
103 const __m128i *aligned_buf = (
const __m128i *)(
// Input size plus the leading bytes that precede buf in its aligned block.
113 const size_t size2 = skip_start +
size;
// Per-byte shuffle/blend controls: ramp value minus skip_start / skip_end.
// Bytes whose result is negative have their top bit set.
// (skip_end is defined on a line that is not part of this listing.)
120 const __m128i mask_start
121 = _mm_sub_epi8(vramp, _mm_set1_epi8((
char)skip_start));
122 const __m128i mask_end
123 = _mm_sub_epi8(vramp, _mm_set1_epi8((
char)skip_end));
// First aligned 16-byte block. _mm_blendv_epi8 takes the byte from the
// second operand (zero) wherever the mask byte has its top bit set, i.e. for
// the skip_start byte positions that lie before buf.
128 const __m128i data0 = _mm_blendv_epi8(_mm_load_si128(aligned_buf),
129 _mm_setzero_si128(), mask_start);
// The following code is compiled only when
// CRC_USE_GENERIC_FOR_SMALL_INPUTS is not defined.
// NOTE(review): presumably this is the handling of short inputs; the
// conditions and braces that delimit the individual cases are not visible.
134#ifndef CRC_USE_GENERIC_FOR_SMALL_INPUTS
// Shuffle control built from (size - 16), used to position initial_crc in
// *v0 / *v1.
148 const __m128i mask_low = _mm_add_epi8(
149 vramp, _mm_set1_epi8((
char)(
size - 16)));
150 MASK_LH(initial_crc, mask_low, *v0, *v1);
159 MASK_L(data0, mask_end, v3);
// Second aligned block for this path.
163 const __m128i data1 = _mm_load_si128(aligned_buf);
// Shuffle data0 and data1 with mask_end and XOR the pieces into *v0.
170 MASK_H(data0, mask_end, v2);
171 MASK_L(data1, mask_end, v3);
172 *v0 = _mm_xor_si128(*v0, v2);
175 *v0 = _mm_xor_si128(*v0, v3);
// *v1 = bytes 8..23 of the 32-byte concatenation (*v1 : *v0).
176 *v1 = _mm_alignr_epi8(*v1, *v0, 8);
// Main path: load the second aligned block and compute the loop bound.
181 const __m128i data1 = _mm_load_si128(aligned_buf);
// end = aligned_buf advanced by (size2 - 16) bytes, using byte arithmetic.
182 const __m128i *end = (
const __m128i*)(
183 (
const char *)aligned_buf - 16 + size2);
// Position the initial CRC with mask_start, then XOR in the first two
// data blocks.
186 MASK_LH(initial_crc, mask_start, *v0, *v1);
187 *v0 = _mm_xor_si128(*v0, data0);
188 *v1 = _mm_xor_si128(*v1, data1);
// Folding loop, one aligned 16-byte block per iteration:
//   *v1 ^= clmul(low  64 bits of *v0, low  64 bits of vfold16)
//   *v0  = *v1 ^ clmul(high 64 bits of *v0, high 64 bits of vfold16)
//   *v1  = next aligned block
// The order of these three statements matters: the second clmul must use
// the old *v0, and the load must come last.
190 while (aligned_buf < end) {
191 *v1 = _mm_xor_si128(*v1, _mm_clmulepi64_si128(
192 *v0, vfold16, 0x00));
193 *v0 = _mm_xor_si128(*v1, _mm_clmulepi64_si128(
194 *v0, vfold16, 0x11));
195 *v1 = _mm_load_si128(aligned_buf++);
// When the loop did not stop exactly at `end`, re-shuffle the *v0 / *v1 pair
// with mask_end. NOTE(review): presumably this handles a final block that is
// only partly inside the input; confirm against the complete source.
198 if (aligned_buf != end) {
199 MASK_H(*v0, mask_end, v2);
200 MASK_L(*v0, mask_end, *v0);
201 MASK_L(*v1, mask_end, v3);
202 *v1 = _mm_or_si128(v2, v3);
// One more fold step with the same two multiplications as in the loop.
205 *v1 = _mm_xor_si128(*v1, _mm_clmulepi64_si128(
206 *v0, vfold16, 0x00));
207 *v0 = _mm_xor_si128(*v1, _mm_clmulepi64_si128(
208 *v0, vfold16, 0x11));
// *v1 = *v0 shifted right by 8 bytes (upper half moved to the lower half).
209 *v1 = _mm_srli_si128(*v0, 8);
// CRC32 entry point, compiled only when BUILDING_CRC32_CLMUL is defined.
243#ifdef BUILDING_CRC32_CLMUL
// Computes the CRC32 of `size` bytes at `buf`, continuing from the previous
// value `crc`. The state is inverted on entry (~crc) and again on return.
//
// NOTE(review): the return type, braces, the declarations of v0/v1/v2 and
// the body of the #ifndef below are not part of this listing.
248crc32_arch_optimized(
const uint8_t *
buf,
size_t size, uint32_t crc)
250#ifndef CRC_USE_GENERIC_FOR_SMALL_INPUTS
// Constants for the carry-less-multiplication folding and the final
// reduction. NOTE(review): the names look like they follow the constant
// naming of Intel's PCLMULQDQ CRC white paper; confirm before changing.
257 const int64_t p = 0x1db710640;
258 const int64_t mu = 0x1f7011641;
259 const int64_t k5 = 0x163cd6124;
260 const int64_t k4 = 0x0ccaa009e;
261 const int64_t k3 = 0x1751997d0;
// _mm_set_epi64x(hi, lo): the first argument goes to the upper 64 bits.
263 const __m128i vfold4 = _mm_set_epi64x(mu, p);
264 const __m128i vfold8 = _mm_set_epi64x(0, k5);
265 const __m128i vfold16 = _mm_set_epi64x(k4, k3);
// Run the shared SIMD core. The initial state is ~crc in the lowest 32 bits
// of the vector; _mm_cvtsi32_si128 zeroes the remaining bits.
269 crc_simd_body(
buf,
size, &v0, &v1, vfold16,
270 _mm_cvtsi32_si128((int32_t)~crc));
// Final reduction of v0 / v1 down to 32 bits.
// (The line that opens the next statement is not part of this listing.)
273 _mm_clmulepi64_si128(v0, vfold16, 0x10), v1);
274 v2 = _mm_shuffle_epi32(v1, 0xe7);
275 v0 = _mm_slli_epi64(v1, 32);
276 v0 = _mm_clmulepi64_si128(v0, vfold8, 0x00);
277 v0 = _mm_xor_si128(v0, v2);
// Two multiplications by the halves of vfold4 (mu, then p), XORed back
// into v0.
278 v2 = _mm_clmulepi64_si128(v0, vfold4, 0x10);
279 v2 = _mm_clmulepi64_si128(v2, vfold4, 0x00);
280 v0 = _mm_xor_si128(v0, v2);
// The result is 32-bit element 2 of v0, inverted.
281 return ~(uint32_t)_mm_extract_epi32(v0, 2);
// CRC64 entry point, compiled only when BUILDING_CRC64_CLMUL is defined.
317#ifdef BUILDING_CRC64_CLMUL
// With MSVC (but not the Intel compiler or clang in MSVC mode) the "g"
// optimizations are switched off for the function below and restored after
// it. NOTE(review): the condition continues on a line that is not part of
// this listing, so the exact set of affected builds cannot be read here.
330#if defined(_MSC_VER) && !defined(__INTEL_COMPILER) && !defined(__clang__) \
332# pragma optimize("g", off)
// Computes the CRC64 of `size` bytes at `buf`, continuing from the previous
// value `crc`. The state is inverted on entry (~crc) and again on return.
//
// NOTE(review): the return type, braces, the declarations of v0/v1/v2,
// the #else/#endif lines and the body of the #ifndef below are not part of
// this listing.
338crc64_arch_optimized(
const uint8_t *
buf,
size_t size, uint64_t crc)
340#ifndef CRC_USE_GENERIC_FOR_SMALL_INPUTS
// Constants for the carry-less-multiplication folding and the final
// reduction. NOTE(review): the names look like they follow the constant
// naming of Intel's PCLMULQDQ CRC white paper; confirm before changing.
347 const uint64_t p = 0x92d8af2baf0e1e85;
348 const uint64_t mu = 0x9c3e466c172963d5;
349 const uint64_t k2 = 0xdabe95afc7875f40;
350 const uint64_t k1 = 0xe05dd497ca393ae4;
// _mm_set_epi64x(hi, lo): the first argument goes to the upper 64 bits.
352 const __m128i vfold8 = _mm_set_epi64x((int64_t)p, (int64_t)mu);
353 const __m128i vfold16 = _mm_set_epi64x((int64_t)k2, (int64_t)k1);
// Run the shared SIMD core with ~crc in the low 64 bits of the initial
// vector. 32-bit x86 builds construct that vector with _mm_set_epi64x();
// other builds use _mm_cvtsi64_si128(). NOTE(review): presumably because
// the latter intrinsic is only available in 64-bit builds; confirm.
357#if defined(__i386__) || defined(_M_IX86)
358 crc_simd_body(
buf,
size, &v0, &v1, vfold16,
359 _mm_set_epi64x(0, (int64_t)~crc));
363 crc_simd_body(
buf,
size, &v0, &v1, vfold16,
364 _mm_cvtsi64_si128((int64_t)~crc));
// Final reduction of v0 / v1 down to 64 bits.
367 v1 = _mm_xor_si128(_mm_clmulepi64_si128(v0, vfold16, 0x10), v1);
368 v0 = _mm_clmulepi64_si128(v1, vfold8, 0x00);
369 v2 = _mm_clmulepi64_si128(v0, vfold8, 0x10);
370 v0 = _mm_xor_si128(_mm_xor_si128(v1, _mm_slli_si128(v0, 8)), v2);
// The result is the upper 64 bits of v0, inverted. 32-bit x86 builds
// assemble it from 32-bit elements 3 (high half) and 2 (low half); other
// builds read 64-bit element 1 directly.
372#if defined(__i386__) || defined(_M_IX86)
373 return ~(((uint64_t)(uint32_t)_mm_extract_epi32(v0, 3) << 32) |
374 (uint64_t)(uint32_t)_mm_extract_epi32(v0, 2));
376 return ~(uint64_t)_mm_extract_epi64(v0, 1);
// Restore the optimization settings changed before the function.
380#if defined(_MSC_VER) && !defined(__INTEL_COMPILER) && !defined(__clang__) \
382# pragma optimize("", on)
// Runtime check for the CPU features this file's CRC code relies on.
// Returns true only when the CPUID query succeeded and every bit in
// ecx_mask is set in the ECX result.
//
// NOTE(review): the return type, the declarations of `success` and `r`,
// and the first branch of the preprocessor chain are not part of this
// listing.
391is_arch_extension_supported(
void)
// With <cpuid.h>: query CPUID leaf 1; r[0..3] receive EAX, EBX, ECX, EDX.
400#elif defined(HAVE_CPUID_H)
404 success = __get_cpuid(1, &r[0], &r[1], &r[2], &r[3]);
// Output operands of an inline-assembly fallback (its opening lines are
// not part of this listing): EAX, EBX, ECX and EDX are stored in r[0]..r[3].
408 :
"=a"(r[0]),
"=b"(r[1]),
"=c"(r[2]),
"=d"(r[3])
// CPUID leaf 1 ECX feature bits: bit 1 = PCLMULQDQ, bit 9 = SSSE3,
// bit 19 = SSE4.1. These are the same three extensions that
// crc_attr_target enables. All three must be present.
416 const uint32_t ecx_mask = (1 << 1) | (1 << 9) | (1 << 19);
417 return success && (r[2] & ecx_mask) == ecx_mask;
#define crc_attr_no_sanitize_address
Definition crc_common.h:47
#define MASK_LH(in, mask, low, high)
Definition crc_x86_clmul.h:67
#define MASK_H(in, mask, r)
Definition crc_x86_clmul.h:64
#define MASK_L(in, mask, r)
Definition crc_x86_clmul.h:62
#define crc_attr_target
Definition crc_x86_clmul.h:58
bool success
Definition run.py:727
char buf[N_BUF]
Definition spewG.c:36
size_t uintptr_t
Definition fuzzer.c:71
#define lzma_always_inline
Definition common.h:114