<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:svg="http://www.w3.org/2000/svg" xmlns:x86="http://www.felixcloutier.com/x86"><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8"><link rel="stylesheet" type="text/css" href="style.css"><title>VPDPBUSD — Multiply and Add Unsigned and Signed Bytes</title></head><body><header><nav><ul><li><a href='index.html'>Index</a></li><li>December 2023</li></ul></nav></header><h1>VPDPBUSD — Multiply and Add Unsigned and Signed Bytes</h1>
<table>
<tr>
<th>Opcode/Instruction</th>
<th>Op/En</th>
<th>64/32 bit Mode Support</th>
<th>CPUID Feature Flag</th>
<th>Description</th></tr>
<tr>
<td>VEX.128.66.0F38.W0 50 /r VPDPBUSD xmm1, xmm2, xmm3/m128</td>
<td>A</td>
<td>V/V</td>
<td>AVX-VNNI</td>
<td>Multiply groups of 4 pairs of signed bytes in xmm3/m128 with corresponding unsigned bytes of xmm2, summing those products and adding them to doubleword result in xmm1.</td></tr>
<tr>
<td>VEX.256.66.0F38.W0 50 /r VPDPBUSD ymm1, ymm2, ymm3/m256</td>
<td>A</td>
<td>V/V</td>
<td>AVX-VNNI</td>
<td>Multiply groups of 4 pairs of signed bytes in ymm3/m256 with corresponding unsigned bytes of ymm2, summing those products and adding them to doubleword result in ymm1.</td></tr>
<tr>
<td>EVEX.128.66.0F38.W0 50 /r VPDPBUSD xmm1{k1}{z}, xmm2, xmm3/m128/m32bcst</td>
<td>B</td>
<td>V/V</td>
<td>AVX512_VNNI AVX512VL</td>
<td>Multiply groups of 4 pairs of signed bytes in xmm3/m128/m32bcst with corresponding unsigned bytes of xmm2, summing those products and adding them to doubleword result in xmm1 under writemask k1.</td></tr>
<tr>
<td>EVEX.256.66.0F38.W0 50 /r VPDPBUSD ymm1{k1}{z}, ymm2, ymm3/m256/m32bcst</td>
<td>B</td>
<td>V/V</td>
<td>AVX512_VNNI AVX512VL</td>
<td>Multiply groups of 4 pairs of signed bytes in ymm3/m256/m32bcst with corresponding unsigned bytes of ymm2, summing those products and adding them to doubleword result in ymm1 under writemask k1.</td></tr>
<tr>
<td>EVEX.512.66.0F38.W0 50 /r VPDPBUSD zmm1{k1}{z}, zmm2, zmm3/m512/m32bcst</td>
<td>B</td>
<td>V/V</td>
<td>AVX512_VNNI</td>
<td>Multiply groups of 4 pairs of signed bytes in zmm3/m512/m32bcst with corresponding unsigned bytes of zmm2, summing those products and adding them to doubleword result in zmm1 under writemask k1.</td></tr></table>
<h2 id="instruction-operand-encoding">Instruction Operand Encoding<a class="anchor" href="#instruction-operand-encoding">
			¶
		</a></h2>
<table>
<tr>
<th>Op/En</th>
<th>Tuple</th>
<th>Operand 1</th>
<th>Operand 2</th>
<th>Operand 3</th>
<th>Operand 4</th></tr>
<tr>
<td>A</td>
<td>N/A</td>
<td>ModRM:reg (r, w)</td>
<td>VEX.vvvv (r)</td>
<td>ModRM:r/m (r)</td>
<td>N/A</td></tr>
<tr>
<td>B</td>
<td>Full</td>
<td>ModRM:reg (r, w)</td>
<td>EVEX.vvvv (r)</td>
<td>ModRM:r/m (r)</td>
<td>N/A</td></tr></table>
<h3 id="description">Description<a class="anchor" href="#description">
			¶
		</a></h3>
<p>Multiplies the individual unsigned bytes of the first source operand by the corresponding signed bytes of the second source operand, producing intermediate signed word results. The four word results of each group are then summed and accumulated into the corresponding doubleword element of the destination operand.</p>
<p>This instruction supports memory fault suppression.</p>
<h3 id="operation">Operation<a class="anchor" href="#operation">
			¶
		</a></h3>
<h4 id="vpdpbusd-dest--src1--src2--vex-encoded-versions-">VPDPBUSD dest, src1, src2 (VEX encoded versions)<a class="anchor" href="#vpdpbusd-dest--src1--src2--vex-encoded-versions-">
			¶
		</a></h4>
<pre>VL=(128, 256)
KL=VL/32
ORIGDEST := DEST
FOR i := 0 TO KL-1:
    // Extending to 16b
    // src1extend := ZERO_EXTEND
    // src2extend := SIGN_EXTEND
    p1word := src1extend(SRC1.byte[4*i+0]) * src2extend(SRC2.byte[4*i+0])
    p2word := src1extend(SRC1.byte[4*i+1]) * src2extend(SRC2.byte[4*i+1])
    p3word := src1extend(SRC1.byte[4*i+2]) * src2extend(SRC2.byte[4*i+2])
    p4word := src1extend(SRC1.byte[4*i+3]) * src2extend(SRC2.byte[4*i+3])
    DEST.dword[i] := ORIGDEST.dword[i] + p1word + p2word + p3word + p4word
DEST[MAX_VL-1:VL] := 0
</pre>
<h4 id="vpdpbusd-dest--src1--src2--evex-encoded-versions-">VPDPBUSD dest, src1, src2 (EVEX encoded versions)<a class="anchor" href="#vpdpbusd-dest--src1--src2--evex-encoded-versions-">
			¶
		</a></h4>
<pre>(KL,VL)=(4,128), (8,256), (16,512)
ORIGDEST := DEST
FOR i := 0 TO KL-1:
    IF k1[i] or *no writemask*:
        // Byte elements of SRC1 are zero-extended to 16b and
        // byte elements of SRC2 are sign extended to 16b before multiplication.
        IF SRC2 is memory and EVEX.b == 1:
            t := SRC2.dword[0]
        ELSE:
            t := SRC2.dword[i]
        p1word := ZERO_EXTEND(SRC1.byte[4*i]) * SIGN_EXTEND(t.byte[0])
        p2word := ZERO_EXTEND(SRC1.byte[4*i+1]) * SIGN_EXTEND(t.byte[1])
        p3word := ZERO_EXTEND(SRC1.byte[4*i+2]) * SIGN_EXTEND(t.byte[2])
        p4word := ZERO_EXTEND(SRC1.byte[4*i+3]) * SIGN_EXTEND(t.byte[3])
        DEST.dword[i] := ORIGDEST.dword[i] + p1word + p2word + p3word + p4word
    ELSE IF *zeroing*:
        DEST.dword[i] := 0
    ELSE: // Merge masking, dest element unchanged
        DEST.dword[i] := ORIGDEST.dword[i]
DEST[MAX_VL-1:VL] := 0
</pre>
<h3 id="intel-c-c++-compiler-intrinsic-equivalent">Intel C/C++ Compiler Intrinsic Equivalent<a class="anchor" href="#intel-c-c++-compiler-intrinsic-equivalent">
			¶
		</a></h3>
<pre>VPDPBUSD __m128i _mm_dpbusd_avx_epi32(__m128i, __m128i, __m128i);
</pre>
<pre>VPDPBUSD __m128i _mm_dpbusd_epi32(__m128i, __m128i, __m128i);
</pre>
<pre>VPDPBUSD __m128i _mm_mask_dpbusd_epi32(__m128i, __mmask8, __m128i, __m128i);
</pre>
<pre>VPDPBUSD __m128i _mm_maskz_dpbusd_epi32(__mmask8, __m128i, __m128i, __m128i);
</pre>
<pre>VPDPBUSD __m256i _mm256_dpbusd_avx_epi32(__m256i, __m256i, __m256i);
</pre>
<pre>VPDPBUSD __m256i _mm256_dpbusd_epi32(__m256i, __m256i, __m256i);
</pre>
<pre>VPDPBUSD __m256i _mm256_mask_dpbusd_epi32(__m256i, __mmask8, __m256i, __m256i);
</pre>
<pre>VPDPBUSD __m256i _mm256_maskz_dpbusd_epi32(__mmask8, __m256i, __m256i, __m256i);
</pre>
<pre>VPDPBUSD __m512i _mm512_dpbusd_epi32(__m512i, __m512i, __m512i);
</pre>
<pre>VPDPBUSD __m512i _mm512_mask_dpbusd_epi32(__m512i, __mmask16, __m512i, __m512i);
</pre>
<pre>VPDPBUSD __m512i _mm512_maskz_dpbusd_epi32(__mmask16, __m512i, __m512i, __m512i);
</pre>
<h3 class="exceptions" id="simd-floating-point-exceptions">SIMD Floating-Point Exceptions<a class="anchor" href="#simd-floating-point-exceptions">
			¶
		</a></h3>
<p>None.</p>
<h3 class="exceptions" id="other-exceptions">Other Exceptions<a class="anchor" href="#other-exceptions">
			¶
		</a></h3>
<p>Non-EVEX-encoded instruction, see <span class="not-imported">Table 2-21</span>, “Type 4 Class Exception Conditions.”</p>
<p>EVEX-encoded instruction, see <span class="not-imported">Table 2-49</span>, “Type E4 Class Exception Conditions.”</p><footer><p>
		This UNOFFICIAL, mechanically-separated, non-verified reference is provided for convenience, but it may be
		inc<span style="opacity: 0.2">omp</span>lete or b<sub>r</sub>oke<sub>n</sub> in various obvious or non-obvious
		ways. Refer to <a href="https://software.intel.com/en-us/download/intel-64-and-ia-32-architectures-sdm-combined-volumes-1-2a-2b-2c-2d-3a-3b-3c-3d-and-4">Intel® 64 and IA-32 Architectures Software Developer’s Manual</a> for anything serious.
	</p></footer></body></html>