cranelift_codegen/isa/x64/inst/emit.rs

use crate::binemit::{Addend, Reloc};
use crate::ir::immediates::{Ieee32, Ieee64};
use crate::ir::TrapCode;
use crate::ir::{KnownSymbol, LibCall};
use crate::isa::x64::encoding::evex::{EvexInstruction, EvexVectorLength};
use crate::isa::x64::encoding::rex::{
    emit_simm, emit_std_enc_enc, emit_std_enc_mem, emit_std_reg_mem, emit_std_reg_reg, int_reg_enc,
    low8_will_sign_extend_to_32, low8_will_sign_extend_to_64, reg_enc, LegacyPrefixes, OpcodeMap,
    RexFlags,
};
use crate::isa::x64::encoding::vex::{RegisterOrAmode, VexInstruction, VexVectorLength};
use crate::isa::x64::inst::args::*;
use crate::isa::x64::inst::*;
use crate::machinst::{inst_common, MachBuffer, MachInstEmit, MachLabel, Reg, Writable};
use core::convert::TryInto;

/// A small helper to generate a signed conversion instruction.
fn emit_signed_cvt(
    sink: &mut MachBuffer<Inst>,
    info: &EmitInfo,
    state: &mut EmitState,
    // Required to be RealRegs.
    src: Reg,
    dst: Writable<Reg>,
    to_f64: bool,
) {
    // Handle an unsigned int, which is the "easy" case: a signed conversion will do the
    // right thing.
    let op = if to_f64 {
        SseOpcode::Cvtsi2sd
    } else {
        SseOpcode::Cvtsi2ss
    };
    let inst = Inst::gpr_to_xmm(op, RegMem::reg(src), OperandSize::Size64, dst);
    inst.emit(&[], sink, info, state);
}

/// Emits a one way conditional jump if CC is set (true).
fn one_way_jmp(sink: &mut MachBuffer<Inst>, cc: CC, label: MachLabel) {
    let cond_start = sink.cur_offset();
    let cond_disp_off = cond_start + 2;
    sink.use_label_at_offset(cond_disp_off, label, LabelUse::JmpRel32);
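    // Jcc rel32 is encoded as 0F 80+cc followed by a 32-bit displacement. The
    // displacement field starts two bytes into the instruction, which is why the
    // label use above is registered at `cond_start + 2`; the zero placeholder
    // emitted below is patched when the label is bound.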
    sink.put1(0x0F);
    sink.put1(0x80 + cc.get_enc());
    sink.put4(0x0);
}

/// Emits a relocation, attaching the current source location as well.
fn emit_reloc(sink: &mut MachBuffer<Inst>, kind: Reloc, name: &ExternalName, addend: Addend) {
    sink.add_reloc(kind, name, addend);
}

/// The top-level emit function.
///
/// Important!  Do not add improved (shortened) encoding cases to existing
/// instructions without also adding tests for those improved encodings.  That
/// is a dangerous game that leads to hard-to-track-down errors in the emitted
/// code.
///
/// For all instructions, make sure to have test coverage for all of the
/// following situations.  Do this by creating the cross product resulting from
/// applying the following rules to each operand:
///
/// (1) for any insn that mentions a register: one test using a register from
///     the group [rax, rcx, rdx, rbx, rsp, rbp, rsi, rdi] and a second one
///     using a register from the group [r8, r9, r10, r11, r12, r13, r14, r15].
///     This helps detect incorrect REX prefix construction.
///
/// (2) for any insn that mentions a byte register: one test for each of the
///     four encoding groups [al, cl, dl, bl], [spl, bpl, sil, dil],
///     [r8b .. r11b] and [r12b .. r15b].  This checks that
///     apparently-redundant REX prefixes are retained when required.
///
/// (3) for any insn that contains an immediate field, check the following
///     cases: field is zero, field is in simm8 range (-128 .. 127), field is
///     in simm32 range (-0x8000_0000 .. 0x7FFF_FFFF).  This is because some
///     instructions that require a 32-bit immediate have a short-form encoding
///     when the imm is in simm8 range.
///
/// Rules (1), (2) and (3) don't apply for registers within address expressions
/// (`Addr`s).  Those are already pretty well tested, and the registers in them
/// don't have any effect on the containing instruction (apart from possibly
/// requiring REX prefix bits).
84///
85/// When choosing registers for a test, avoid using registers with the same
86/// offset within a given group.  For example, don't use rax and r8, since they
87/// both have the lowest 3 bits as 000, and so the test won't detect errors
88/// where those 3-bit register sub-fields are confused by the emitter.  Instead
89/// use (eg) rax (lo3 = 000) and r9 (lo3 = 001).  Similarly, don't use (eg) cl
90/// and bpl since they have the same offset in their group; use instead (eg) cl
91/// and sil.
92///
93/// For all instructions, also add a test that uses only low-half registers
94/// (rax .. rdi, xmm0 .. xmm7) etc, so as to check that any redundant REX
95/// prefixes are correctly omitted.  This low-half restriction must apply to
96/// _all_ registers in the insn, even those in address expressions.
97///
98/// Following these rules creates large numbers of test cases, but it's the
99/// only way to make the emitter reliable.
100///
101/// Known possible improvements:
102///
103/// * there's a shorter encoding for shl/shr/sar by a 1-bit immediate.  (Do we
104///   care?)
105pub(crate) fn emit(
106    inst: &Inst,
107    allocs: &mut AllocationConsumer<'_>,
108    sink: &mut MachBuffer<Inst>,
109    info: &EmitInfo,
110    state: &mut EmitState,
111) {
112    let matches_isa_flags = |iset_requirement: &InstructionSet| -> bool {
113        match iset_requirement {
114            // Cranelift assumes SSE2 at least.
115            InstructionSet::SSE | InstructionSet::SSE2 => true,
116            InstructionSet::SSSE3 => info.isa_flags.use_ssse3(),
117            InstructionSet::SSE41 => info.isa_flags.use_sse41(),
118            InstructionSet::SSE42 => info.isa_flags.use_sse42(),
119            InstructionSet::Popcnt => info.isa_flags.use_popcnt(),
120            InstructionSet::Lzcnt => info.isa_flags.use_lzcnt(),
121            InstructionSet::BMI1 => info.isa_flags.use_bmi1(),
122            InstructionSet::BMI2 => info.isa_flags.has_bmi2(),
123            InstructionSet::FMA => info.isa_flags.has_fma(),
124            InstructionSet::AVX => info.isa_flags.has_avx(),
125            InstructionSet::AVX2 => info.isa_flags.has_avx2(),
126            InstructionSet::AVX512BITALG => info.isa_flags.has_avx512bitalg(),
127            InstructionSet::AVX512DQ => info.isa_flags.has_avx512dq(),
128            InstructionSet::AVX512F => info.isa_flags.has_avx512f(),
129            InstructionSet::AVX512VBMI => info.isa_flags.has_avx512vbmi(),
130            InstructionSet::AVX512VL => info.isa_flags.has_avx512vl(),
131        }
132    };
133
134    // Certain instructions may be present in more than one ISA feature set; we must at least match
135    // one of them in the target CPU.
136    let isa_requirements = inst.available_in_any_isa();
137    if !isa_requirements.is_empty() && !isa_requirements.iter().all(matches_isa_flags) {
138        panic!(
139            "Cannot emit inst '{:?}' for target; failed to match ISA requirements: {:?}",
140            inst, isa_requirements
141        )
142    }
143
144    match inst {
145        Inst::AluRmiR {
146            size,
147            op,
148            src1,
149            src2,
150            dst: reg_g,
151        } => {
152            let src1 = allocs.next(src1.to_reg());
153            let reg_g = allocs.next(reg_g.to_reg().to_reg());
154            debug_assert_eq!(src1, reg_g);
155            let src2 = src2.clone().to_reg_mem_imm().with_allocs(allocs);
156
157            let rex = RexFlags::from(*size);
158            if *op == AluRmiROpcode::Mul {
159                // We kinda freeloaded Mul into RMI_R_Op, but it doesn't fit the usual pattern, so
160                // we have to special-case it.
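                // Two-operand IMUL r, r/m is 0F AF /r; the immediate forms used below
                // are 6B /r ib (sign-extended imm8) and 69 /r id (imm32).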
                match src2 {
                    RegMemImm::Reg { reg: reg_e } => {
                        emit_std_reg_reg(sink, LegacyPrefixes::None, 0x0FAF, 2, reg_g, reg_e, rex);
                    }

                    RegMemImm::Mem { addr } => {
                        let amode = addr.finalize(state, sink);
                        emit_std_reg_mem(
                            sink,
                            LegacyPrefixes::None,
                            0x0FAF,
                            2,
                            reg_g,
                            &amode,
                            rex,
                            0,
                        );
                    }

                    RegMemImm::Imm { simm32 } => {
                        let use_imm8 = low8_will_sign_extend_to_32(simm32);
                        let opcode = if use_imm8 { 0x6B } else { 0x69 };
                        // Yes, really, reg_g twice.
                        emit_std_reg_reg(sink, LegacyPrefixes::None, opcode, 1, reg_g, reg_g, rex);
                        emit_simm(sink, if use_imm8 { 1 } else { 4 }, simm32);
                    }
                }
            } else {
                let (opcode_r, opcode_m, subopcode_i) = match op {
                    AluRmiROpcode::Add => (0x01, 0x03, 0),
                    AluRmiROpcode::Adc => (0x11, 0x13, 2),
                    AluRmiROpcode::Sub => (0x29, 0x2B, 5),
                    AluRmiROpcode::Sbb => (0x19, 0x1B, 3),
                    AluRmiROpcode::And => (0x21, 0x23, 4),
                    AluRmiROpcode::Or => (0x09, 0x0B, 1),
                    AluRmiROpcode::Xor => (0x31, 0x33, 6),
                    AluRmiROpcode::Mul => panic!("unreachable"),
                };
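                // Each tuple is (opcode for the reg/reg MR form, opcode for the
                // reg-from-memory RM form, and the /digit used by the group-1
                // immediate encodings 0x81/0x83).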

                match src2 {
                    RegMemImm::Reg { reg: reg_e } => {
                        // GCC/llvm use the swapped operand encoding (viz., the R/RM vs RM/R
                        // duality). Do this too, so as to be able to compare generated machine
                        // code easily.
                        emit_std_reg_reg(
                            sink,
                            LegacyPrefixes::None,
                            opcode_r,
                            1,
                            reg_e,
                            reg_g,
                            rex,
                        );
                    }

                    RegMemImm::Mem { addr } => {
                        let amode = addr.finalize(state, sink);
                        // Here we revert to the "normal" G-E ordering.
                        emit_std_reg_mem(
                            sink,
                            LegacyPrefixes::None,
                            opcode_m,
                            1,
                            reg_g,
                            &amode,
                            rex,
                            0,
                        );
                    }

                    RegMemImm::Imm { simm32 } => {
                        let use_imm8 = low8_will_sign_extend_to_32(simm32);
                        let opcode = if use_imm8 { 0x83 } else { 0x81 };
                        // And also here we use the "normal" G-E ordering.
                        let enc_g = int_reg_enc(reg_g);
                        emit_std_enc_enc(
                            sink,
                            LegacyPrefixes::None,
                            opcode,
                            1,
                            subopcode_i,
                            enc_g,
                            rex,
                        );
                        emit_simm(sink, if use_imm8 { 1 } else { 4 }, simm32);
                    }
                }
            }
        }

        Inst::AluConstOp { op, size, dst } => {
            let dst = allocs.next(dst.to_reg().to_reg());
            emit(
                &Inst::AluRmiR {
                    size: *size,
                    op: *op,
                    dst: Writable::from_reg(Gpr::new(dst).unwrap()),
                    src1: Gpr::new(dst).unwrap(),
                    src2: Gpr::new(dst).unwrap().into(),
                },
                allocs,
                sink,
                info,
                state,
            );
        }

        Inst::AluRM {
            size,
            src1_dst,
            src2,
            op,
        } => {
            let src2 = allocs.next(src2.to_reg());
            let src1_dst = src1_dst.finalize(state, sink).with_allocs(allocs);

            assert!(*size == OperandSize::Size32 || *size == OperandSize::Size64);
            let opcode = match op {
                AluRmiROpcode::Add => 0x01,
                AluRmiROpcode::Sub => 0x29,
                AluRmiROpcode::And => 0x21,
                AluRmiROpcode::Or => 0x09,
                AluRmiROpcode::Xor => 0x31,
                _ => panic!("Unsupported read-modify-write ALU opcode"),
            };
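            // These are the MR ("store") forms, e.g. ADD r/m, r is 01 /r, since the
            // destination operand here is memory.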
            let enc_g = int_reg_enc(src2);
            emit_std_enc_mem(
                sink,
                LegacyPrefixes::None,
                opcode,
                1,
                enc_g,
                &src1_dst,
                RexFlags::from(*size),
                0,
            );
        }

        Inst::AluRmRVex {
            size,
            op,
            dst,
            src1,
            src2,
        } => {
            use AluRmROpcode::*;
            let dst = allocs.next(dst.to_reg().to_reg());
            let src1 = allocs.next(src1.to_reg());
            let src2 = allocs.next(src2.to_reg());

            let w = match size {
                OperandSize::Size32 => false,
                OperandSize::Size64 => true,

                // the other cases would be rejected by isle constructors
                _ => unreachable!(),
            };

            let opcode = match op {
                Andn => 0xf2,
            };
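            // ANDN is VEX.LZ.0F38 F2 /r (W0 for 32-bit, W1 for 64-bit); it computes
            // dst = !src1 & src2, with src1 carried in the VEX.vvvv field.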

            VexInstruction::new()
                .map(OpcodeMap::_0F38)
                .w(w)
                .reg(dst.to_real_reg().unwrap().hw_enc())
                .vvvv(src1.to_real_reg().unwrap().hw_enc())
                .rm(src2.to_real_reg().unwrap().hw_enc())
                .opcode(opcode)
                .encode(sink);
        }

        Inst::UnaryRmR { size, op, src, dst } => {
            let dst = allocs.next(dst.to_reg().to_reg());
            let rex_flags = RexFlags::from(*size);
            use UnaryRmROpcode::*;
            let prefix = match size {
                OperandSize::Size16 => match op {
                    Bsr | Bsf => LegacyPrefixes::_66,
                    Lzcnt | Tzcnt | Popcnt => LegacyPrefixes::_66F3,
                },
                OperandSize::Size32 | OperandSize::Size64 => match op {
                    Bsr | Bsf => LegacyPrefixes::None,
                    Lzcnt | Tzcnt | Popcnt => LegacyPrefixes::_F3,
                },
                _ => unreachable!(),
            };

            let (opcode, num_opcodes) = match op {
                Bsr => (0x0fbd, 2),
                Bsf => (0x0fbc, 2),
                Lzcnt => (0x0fbd, 2),
                Tzcnt => (0x0fbc, 2),
                Popcnt => (0x0fb8, 2),
            };
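            // LZCNT, TZCNT and POPCNT are the F3-prefixed forms of 0F BD, 0F BC and
            // 0F B8; the F3 prefix selected above is what distinguishes LZCNT/TZCNT
            // from plain BSR/BSF.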

            match src.clone().into() {
                RegMem::Reg { reg: src } => {
                    let src = allocs.next(src);
                    emit_std_reg_reg(sink, prefix, opcode, num_opcodes, dst, src, rex_flags);
                }
                RegMem::Mem { addr: src } => {
                    let amode = src.finalize(state, sink).with_allocs(allocs);
                    emit_std_reg_mem(sink, prefix, opcode, num_opcodes, dst, &amode, rex_flags, 0);
                }
            }
        }

        Inst::Not { size, src, dst } => {
            let src = allocs.next(src.to_reg());
            let dst = allocs.next(dst.to_reg().to_reg());
            debug_assert_eq!(src, dst);
            let rex_flags = RexFlags::from((*size, dst));
            let (opcode, prefix) = match size {
                OperandSize::Size8 => (0xF6, LegacyPrefixes::None),
                OperandSize::Size16 => (0xF7, LegacyPrefixes::_66),
                OperandSize::Size32 => (0xF7, LegacyPrefixes::None),
                OperandSize::Size64 => (0xF7, LegacyPrefixes::None),
            };

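            // NOT r/m is the group-3 encoding F6/F7 with ModRM reg field /2.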
            let subopcode = 2;
            let enc_src = int_reg_enc(dst);
            emit_std_enc_enc(sink, prefix, opcode, 1, subopcode, enc_src, rex_flags)
        }

        Inst::Neg { size, src, dst } => {
            let src = allocs.next(src.to_reg());
            let dst = allocs.next(dst.to_reg().to_reg());
            debug_assert_eq!(src, dst);
            let rex_flags = RexFlags::from((*size, dst));
            let (opcode, prefix) = match size {
                OperandSize::Size8 => (0xF6, LegacyPrefixes::None),
                OperandSize::Size16 => (0xF7, LegacyPrefixes::_66),
                OperandSize::Size32 => (0xF7, LegacyPrefixes::None),
                OperandSize::Size64 => (0xF7, LegacyPrefixes::None),
            };

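            // NEG r/m is the group-3 encoding F6/F7 with ModRM reg field /3.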
            let subopcode = 3;
            let enc_src = int_reg_enc(dst);
            emit_std_enc_enc(sink, prefix, opcode, 1, subopcode, enc_src, rex_flags)
        }

        Inst::Div {
            sign,
            trap,
            divisor,
            ..
        }
        | Inst::Div8 {
            sign,
            trap,
            divisor,
            ..
        } => {
            let divisor = divisor.clone().to_reg_mem().with_allocs(allocs);
            let size = match inst {
                Inst::Div {
                    size,
                    dividend_lo,
                    dividend_hi,
                    dst_quotient,
                    dst_remainder,
                    ..
                } => {
                    let dividend_lo = allocs.next(dividend_lo.to_reg());
                    let dividend_hi = allocs.next(dividend_hi.to_reg());
                    let dst_quotient = allocs.next(dst_quotient.to_reg().to_reg());
                    let dst_remainder = allocs.next(dst_remainder.to_reg().to_reg());
                    debug_assert_eq!(dividend_lo, regs::rax());
                    debug_assert_eq!(dividend_hi, regs::rdx());
                    debug_assert_eq!(dst_quotient, regs::rax());
                    debug_assert_eq!(dst_remainder, regs::rdx());
                    *size
                }
                Inst::Div8 { dividend, dst, .. } => {
                    let dividend = allocs.next(dividend.to_reg());
                    let dst = allocs.next(dst.to_reg().to_reg());
                    debug_assert_eq!(dividend, regs::rax());
                    debug_assert_eq!(dst, regs::rax());
                    OperandSize::Size8
                }
                _ => unreachable!(),
            };

            let (opcode, prefix) = match size {
                OperandSize::Size8 => (0xF6, LegacyPrefixes::None),
                OperandSize::Size16 => (0xF7, LegacyPrefixes::_66),
                OperandSize::Size32 => (0xF7, LegacyPrefixes::None),
                OperandSize::Size64 => (0xF7, LegacyPrefixes::None),
            };

            sink.add_trap(*trap);

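            // DIV and IDIV share the group-3 F6/F7 encoding: /6 selects unsigned DIV
            // and /7 selects signed IDIV.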
            let subopcode = match sign {
                DivSignedness::Signed => 7,
                DivSignedness::Unsigned => 6,
            };
            match divisor {
                RegMem::Reg { reg } => {
                    let src = int_reg_enc(reg);
                    emit_std_enc_enc(
                        sink,
                        prefix,
                        opcode,
                        1,
                        subopcode,
                        src,
                        RexFlags::from((size, reg)),
                    )
                }
                RegMem::Mem { addr: src } => {
                    let amode = src.finalize(state, sink);
                    emit_std_enc_mem(
                        sink,
                        prefix,
                        opcode,
                        1,
                        subopcode,
                        &amode,
                        RexFlags::from(size),
                        0,
                    );
                }
            }
        }

        Inst::MulHi {
            size,
            signed,
            src1,
            src2,
            dst_lo,
            dst_hi,
        } => {
            let src1 = allocs.next(src1.to_reg());
            let dst_lo = allocs.next(dst_lo.to_reg().to_reg());
            let dst_hi = allocs.next(dst_hi.to_reg().to_reg());
            debug_assert_eq!(src1, regs::rax());
            debug_assert_eq!(dst_lo, regs::rax());
            debug_assert_eq!(dst_hi, regs::rdx());

            let rex_flags = RexFlags::from(*size);
            let prefix = match size {
                OperandSize::Size16 => LegacyPrefixes::_66,
                OperandSize::Size32 => LegacyPrefixes::None,
                OperandSize::Size64 => LegacyPrefixes::None,
                _ => unreachable!(),
            };

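            // One-operand MUL is F7 /4 and one-operand IMUL is F7 /5; both leave the
            // widened product in rdx:rax.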
            let subopcode = if *signed { 5 } else { 4 };
            match src2.clone().to_reg_mem() {
                RegMem::Reg { reg } => {
                    let reg = allocs.next(reg);
                    let src = int_reg_enc(reg);
                    emit_std_enc_enc(sink, prefix, 0xF7, 1, subopcode, src, rex_flags)
                }
                RegMem::Mem { addr: src } => {
                    let amode = src.finalize(state, sink).with_allocs(allocs);
                    emit_std_enc_mem(sink, prefix, 0xF7, 1, subopcode, &amode, rex_flags, 0);
                }
            }
        }

        Inst::SignExtendData { size, src, dst } => {
            let src = allocs.next(src.to_reg());
            let dst = allocs.next(dst.to_reg().to_reg());
            debug_assert_eq!(src, regs::rax());
            if *size == OperandSize::Size8 {
                debug_assert_eq!(dst, regs::rax());
            } else {
                debug_assert_eq!(dst, regs::rdx());
            }
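            // This is the CBW/CWD/CDQ/CQO family: CBW (66 98) sign-extends AL into
            // AX, while CWD (66 99), CDQ (99) and CQO (REX.W 99) sign-extend rAX into
            // rDX.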
            match size {
                OperandSize::Size8 => {
                    sink.put1(0x66);
                    sink.put1(0x98);
                }
                OperandSize::Size16 => {
                    sink.put1(0x66);
                    sink.put1(0x99);
                }
                OperandSize::Size32 => sink.put1(0x99),
                OperandSize::Size64 => {
                    sink.put1(0x48);
                    sink.put1(0x99);
                }
            }
        }

        Inst::CheckedSRemSeq { divisor, .. } | Inst::CheckedSRemSeq8 { divisor, .. } => {
            let divisor = allocs.next(divisor.to_reg());

            // Validate that the register constraints of the dividend and the
            // destination are all as expected.
            let (dst, size) = match inst {
                Inst::CheckedSRemSeq {
                    dividend_lo,
                    dividend_hi,
                    dst_quotient,
                    dst_remainder,
                    size,
                    ..
                } => {
                    let dividend_lo = allocs.next(dividend_lo.to_reg());
                    let dividend_hi = allocs.next(dividend_hi.to_reg());
                    let dst_quotient = allocs.next(dst_quotient.to_reg().to_reg());
                    let dst_remainder = allocs.next(dst_remainder.to_reg().to_reg());
                    debug_assert_eq!(dividend_lo, regs::rax());
                    debug_assert_eq!(dividend_hi, regs::rdx());
                    debug_assert_eq!(dst_quotient, regs::rax());
                    debug_assert_eq!(dst_remainder, regs::rdx());
                    (regs::rdx(), *size)
                }
                Inst::CheckedSRemSeq8 { dividend, dst, .. } => {
                    let dividend = allocs.next(dividend.to_reg());
                    let dst = allocs.next(dst.to_reg().to_reg());
                    debug_assert_eq!(dividend, regs::rax());
                    debug_assert_eq!(dst, regs::rax());
                    (regs::rax(), OperandSize::Size8)
                }
                _ => unreachable!(),
            };

            // Generates the following code sequence:
            //
            // cmp -1 %divisor
            // jnz $do_op
            //
            // ;; for srem, result is 0
            // mov #0, %dst
            // j $done
            //
            // $do_op:
            // idiv %divisor
            //
            // $done:

            let do_op = sink.get_label();
            let done_label = sink.get_label();

            // Check if the divisor is -1, and if it isn't then immediately
            // go to the `idiv`.
            let inst = Inst::cmp_rmi_r(size, RegMemImm::imm(0xffffffff), divisor);
            inst.emit(&[], sink, info, state);
            one_way_jmp(sink, CC::NZ, do_op);

            // ... otherwise the divisor is -1 and the result is always 0. This
            // is written to the destination register which will be %rax for
            // 8-bit srem and %rdx otherwise.
            //
            // Note that for 16-to-64-bit srem operations this leaves the
            // second destination, %rax, unchanged. This isn't semantically
            // correct if a lowering actually tries to use the `dst_quotient`
            // output but for srem only the `dst_remainder` output is used for
            // now.
            let inst = Inst::imm(OperandSize::Size64, 0, Writable::from_reg(dst));
            inst.emit(&[], sink, info, state);
            let inst = Inst::jmp_known(done_label);
            inst.emit(&[], sink, info, state);

            // Here the `idiv` is executed, which is different depending on the
            // size
            sink.bind_label(do_op);
            let inst = match size {
                OperandSize::Size8 => Inst::div8(
                    DivSignedness::Signed,
                    TrapCode::IntegerDivisionByZero,
                    RegMem::reg(divisor),
                    Gpr::new(regs::rax()).unwrap(),
                    Writable::from_reg(Gpr::new(regs::rax()).unwrap()),
                ),
                _ => Inst::div(
                    size,
                    DivSignedness::Signed,
                    TrapCode::IntegerDivisionByZero,
                    RegMem::reg(divisor),
                    Gpr::new(regs::rax()).unwrap(),
                    Gpr::new(regs::rdx()).unwrap(),
                    Writable::from_reg(Gpr::new(regs::rax()).unwrap()),
                    Writable::from_reg(Gpr::new(regs::rdx()).unwrap()),
                ),
            };
            inst.emit(&[], sink, info, state);

            sink.bind_label(done_label);
        }

        Inst::Imm {
            dst_size,
            simm64,
            dst,
        } => {
            let dst = allocs.next(dst.to_reg().to_reg());
            let enc_dst = int_reg_enc(dst);
            if *dst_size == OperandSize::Size64 {
                if low32_will_sign_extend_to_64(*simm64) {
                    // Sign-extended move imm32.
                    emit_std_enc_enc(
                        sink,
                        LegacyPrefixes::None,
                        0xC7,
                        1,
                        /* subopcode */ 0,
                        enc_dst,
                        RexFlags::set_w(),
                    );
                    sink.put4(*simm64 as u32);
                } else {
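                    // MOV r64, imm64 is REX.W B8+rd io (the `movabs` form); REX.B
                    // carries the high bit of the register number.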
                    sink.put1(0x48 | ((enc_dst >> 3) & 1));
                    sink.put1(0xB8 | (enc_dst & 7));
                    sink.put8(*simm64);
                }
            } else {
                if ((enc_dst >> 3) & 1) == 1 {
                    sink.put1(0x41);
                }
                sink.put1(0xB8 | (enc_dst & 7));
                sink.put4(*simm64 as u32);
            }
        }

        Inst::MovImmM { size, simm64, dst } => {
            let dst = &dst.finalize(state, sink).with_allocs(allocs);
            let default_rex = RexFlags::clear_w();
            let default_opcode = 0xC7;
            let bytes = size.to_bytes();
            let prefix = LegacyPrefixes::None;

            let (opcode, rex, size, prefix) = match *size {
                // In the 8-bit case, we don't need to enforce REX flags via
                // `always_emit_if_8bit_needed()` since the destination
                // operand is a memory operand, not a possibly 8-bit register.
                OperandSize::Size8 => (0xC6, default_rex, bytes, prefix),
                OperandSize::Size16 => (0xC7, default_rex, bytes, LegacyPrefixes::_66),
                OperandSize::Size64 => {
                    if !low32_will_sign_extend_to_64(*simm64) {
                        panic!("Immediate-to-memory moves require immediate operand to sign-extend to 64 bits.");
                    }

                    (default_opcode, RexFlags::from(*size), bytes, prefix)
                }

                _ => (default_opcode, default_rex, bytes, prefix),
            };

            // 8-bit C6 /0 ib
            // 16-bit 0x66 C7 /0 iw
            // 32-bit C7 /0 id
            // 64-bit REX.W C7 /0 id
            emit_std_enc_mem(sink, prefix, opcode, 1, /*subopcode*/ 0, dst, rex, 0);
            emit_simm(sink, size, *simm64 as u32);
        }

        Inst::MovRR { size, src, dst } => {
            let src = allocs.next(src.to_reg());
            let dst = allocs.next(dst.to_reg().to_reg());
            emit_std_reg_reg(
                sink,
                LegacyPrefixes::None,
                0x89,
                1,
                src,
                dst,
                RexFlags::from(*size),
            );
        }

        Inst::MovFromPReg { src, dst } => {
            allocs.next_fixed_nonallocatable(*src);
            let src: Reg = (*src).into();
            debug_assert!([regs::rsp(), regs::rbp(), regs::pinned_reg()].contains(&src));
            let src = Gpr::new(src).unwrap();
            let size = OperandSize::Size64;
            let dst = allocs.next(dst.to_reg().to_reg());
            let dst = WritableGpr::from_writable_reg(Writable::from_reg(dst)).unwrap();
            Inst::MovRR { size, src, dst }.emit(&[], sink, info, state);
        }

        Inst::MovToPReg { src, dst } => {
            let src = allocs.next(src.to_reg());
            let src = Gpr::new(src).unwrap();
            allocs.next_fixed_nonallocatable(*dst);
            let dst: Reg = (*dst).into();
            debug_assert!([regs::rsp(), regs::rbp(), regs::pinned_reg()].contains(&dst));
            let dst = WritableGpr::from_writable_reg(Writable::from_reg(dst)).unwrap();
            let size = OperandSize::Size64;
            Inst::MovRR { size, src, dst }.emit(&[], sink, info, state);
        }

        Inst::MovzxRmR { ext_mode, src, dst } => {
            let dst = allocs.next(dst.to_reg().to_reg());
            let (opcodes, num_opcodes, mut rex_flags) = match ext_mode {
                ExtMode::BL => {
                    // MOVZBL is (REX.W==0) 0F B6 /r
                    (0x0FB6, 2, RexFlags::clear_w())
                }
                ExtMode::BQ => {
                    // MOVZBQ is (REX.W==1) 0F B6 /r
                    // I'm not sure why the Intel manual offers different
                    // encodings for MOVZBQ than for MOVZBL.  AIUI they should
                    // achieve the same, since MOVZBL is just going to zero out
                    // the upper half of the destination anyway.
                    (0x0FB6, 2, RexFlags::set_w())
                }
                ExtMode::WL => {
                    // MOVZWL is (REX.W==0) 0F B7 /r
                    (0x0FB7, 2, RexFlags::clear_w())
                }
                ExtMode::WQ => {
                    // MOVZWQ is (REX.W==1) 0F B7 /r
                    (0x0FB7, 2, RexFlags::set_w())
                }
                ExtMode::LQ => {
                    // This is just a standard 32 bit load, and we rely on the
                    // default zero-extension rule to perform the extension.
                    // Note that in reg/reg mode, gcc seems to use the swapped form R/RM, which we
                    // don't do here, since it's the same encoding size.
                    // MOV r/m32, r32 is (REX.W==0) 8B /r
                    (0x8B, 1, RexFlags::clear_w())
                }
            };

            match src.clone().to_reg_mem() {
                RegMem::Reg { reg: src } => {
                    let src = allocs.next(src);
                    match ext_mode {
                        ExtMode::BL | ExtMode::BQ => {
                            // A redundant REX prefix must be emitted for certain register inputs.
                            rex_flags.always_emit_if_8bit_needed(src);
                        }
                        _ => {}
                    }
                    emit_std_reg_reg(
                        sink,
                        LegacyPrefixes::None,
                        opcodes,
                        num_opcodes,
                        dst,
                        src,
                        rex_flags,
                    )
                }

                RegMem::Mem { addr: src } => {
                    let src = &src.finalize(state, sink).with_allocs(allocs);

                    emit_std_reg_mem(
                        sink,
                        LegacyPrefixes::None,
                        opcodes,
                        num_opcodes,
                        dst,
                        src,
                        rex_flags,
                        0,
                    )
                }
            }
        }

        Inst::Mov64MR { src, dst } => {
            let dst = allocs.next(dst.to_reg().to_reg());
            let src = &src.finalize(state, sink).with_allocs(allocs);

            emit_std_reg_mem(
                sink,
                LegacyPrefixes::None,
                0x8B,
                1,
                dst,
                src,
                RexFlags::set_w(),
                0,
            )
        }

        Inst::LoadEffectiveAddress { addr, dst, size } => {
            let dst = allocs.next(dst.to_reg().to_reg());
            let amode = addr.finalize(state, sink).with_allocs(allocs);

            // If this `lea` can actually get encoded as an `add` then do that
            // instead. Currently all candidate `iadd`s become an `lea`
            // pseudo-instruction here but maximizing the use of `lea` is not
            // necessarily optimal. The `lea` instruction goes through dedicated
            // address units on cores which are finite and disjoint from the
            // general ALU, so if everything uses `lea` then those units can get
            // saturated while leaving the ALU idle.
            //
            // To help make use of more parts of a cpu, this attempts to use
            // `add` when it's semantically equivalent to `lea`, or otherwise
            // when the `dst` register is the same as the `base` or `index`
            // register.
            //
            // FIXME: ideally regalloc is informed of this constraint. Register
            // allocation of `lea` should "attempt" to put the `base` in the
            // same register as `dst` but not at the expense of generating a
            // `mov` instruction. Currently that's not possible but perhaps one
            // day it may be worth it.
            match amode {
                // If `base == dst` then this is `add $imm, %dst`, so encode
                // that instead.
                Amode::ImmReg {
                    simm32,
                    base,
                    flags: _,
                } if base == dst => {
                    let inst = Inst::alu_rmi_r(
                        *size,
                        AluRmiROpcode::Add,
                        RegMemImm::imm(simm32),
                        Writable::from_reg(dst),
                    );
                    inst.emit(&[], sink, info, state);
                }
                // If the offset is 0 and the shift is 0 (meaning multiplication
                // by 1) then:
                //
                // * If `base == dst`, then this is `add %index, %base`
                // * If `index == dst`, then this is `add %base, %index`
                //
                // Encode the appropriate instruction here in that case.
                Amode::ImmRegRegShift {
                    simm32: 0,
                    base,
                    index,
                    shift: 0,
                    flags: _,
                } if base == dst || index == dst => {
                    let (dst, operand) = if base == dst {
                        (base, index)
                    } else {
                        (index, base)
                    };
                    let inst = Inst::alu_rmi_r(
                        *size,
                        AluRmiROpcode::Add,
                        RegMemImm::reg(operand.to_reg()),
                        Writable::from_reg(dst.to_reg()),
                    );
                    inst.emit(&[], sink, info, state);
                }

                // If `lea`'s 3-operand mode is leveraged by regalloc, or if
                // it's fancy like imm-plus-shift-plus-base, then `lea` is
                // actually emitted.
                _ => {
                    let flags = match size {
                        OperandSize::Size32 => RexFlags::clear_w(),
                        OperandSize::Size64 => RexFlags::set_w(),
                        _ => unreachable!(),
                    };
                    emit_std_reg_mem(sink, LegacyPrefixes::None, 0x8D, 1, dst, &amode, flags, 0);
                }
            };
        }

        Inst::MovsxRmR { ext_mode, src, dst } => {
            let dst = allocs.next(dst.to_reg().to_reg());
            let (opcodes, num_opcodes, mut rex_flags) = match ext_mode {
                ExtMode::BL => {
                    // MOVSBL is (REX.W==0) 0F BE /r
                    (0x0FBE, 2, RexFlags::clear_w())
                }
                ExtMode::BQ => {
                    // MOVSBQ is (REX.W==1) 0F BE /r
                    (0x0FBE, 2, RexFlags::set_w())
                }
                ExtMode::WL => {
                    // MOVSWL is (REX.W==0) 0F BF /r
                    (0x0FBF, 2, RexFlags::clear_w())
                }
                ExtMode::WQ => {
                    // MOVSWQ is (REX.W==1) 0F BF /r
                    (0x0FBF, 2, RexFlags::set_w())
                }
                ExtMode::LQ => {
                    // MOVSLQ is (REX.W==1) 63 /r
                    (0x63, 1, RexFlags::set_w())
                }
            };

            match src.clone().to_reg_mem() {
                RegMem::Reg { reg: src } => {
                    let src = allocs.next(src);
                    match ext_mode {
                        ExtMode::BL | ExtMode::BQ => {
                            // A redundant REX prefix must be emitted for certain register inputs.
                            rex_flags.always_emit_if_8bit_needed(src);
                        }
                        _ => {}
                    }
                    emit_std_reg_reg(
                        sink,
                        LegacyPrefixes::None,
                        opcodes,
                        num_opcodes,
                        dst,
                        src,
                        rex_flags,
                    )
                }

                RegMem::Mem { addr: src } => {
                    let src = &src.finalize(state, sink).with_allocs(allocs);

                    emit_std_reg_mem(
                        sink,
                        LegacyPrefixes::None,
                        opcodes,
                        num_opcodes,
                        dst,
                        src,
                        rex_flags,
                        0,
                    )
                }
            }
        }

        Inst::MovRM { size, src, dst } => {
            let src = allocs.next(src.to_reg());
            let dst = &dst.finalize(state, sink).with_allocs(allocs);

            let prefix = match size {
                OperandSize::Size16 => LegacyPrefixes::_66,
                _ => LegacyPrefixes::None,
            };

            let opcode = match size {
                OperandSize::Size8 => 0x88,
                _ => 0x89,
            };

            // This is one of the few places where the presence of a
            // redundant REX prefix changes the meaning of the
            // instruction.
            let rex = RexFlags::from((*size, src));

            //  8-bit: MOV r8, r/m8 is (REX.W==0) 88 /r
            // 16-bit: MOV r16, r/m16 is 66 (REX.W==0) 89 /r
            // 32-bit: MOV r32, r/m32 is (REX.W==0) 89 /r
            // 64-bit: MOV r64, r/m64 is (REX.W==1) 89 /r
            emit_std_reg_mem(sink, prefix, opcode, 1, src, dst, rex, 0);
        }

        Inst::ShiftR {
            size,
            kind,
            src,
            num_bits,
            dst,
        } => {
            let src = allocs.next(src.to_reg());
            let dst = allocs.next(dst.to_reg().to_reg());
            debug_assert_eq!(src, dst);
            let subopcode = match kind {
                ShiftKind::RotateLeft => 0,
                ShiftKind::RotateRight => 1,
                ShiftKind::ShiftLeft => 4,
                ShiftKind::ShiftRightLogical => 5,
                ShiftKind::ShiftRightArithmetic => 7,
            };
            let enc_dst = int_reg_enc(dst);
            let rex_flags = RexFlags::from((*size, dst));
            match num_bits.clone().to_imm8_reg() {
                Imm8Reg::Reg { reg } => {
                    let reg = allocs.next(reg);
                    debug_assert_eq!(reg, regs::rcx());
                    let (opcode, prefix) = match size {
                        OperandSize::Size8 => (0xD2, LegacyPrefixes::None),
                        OperandSize::Size16 => (0xD3, LegacyPrefixes::_66),
                        OperandSize::Size32 => (0xD3, LegacyPrefixes::None),
                        OperandSize::Size64 => (0xD3, LegacyPrefixes::None),
                    };

                    // SHL/SHR/SAR %cl, reg8 is (REX.W==0) D2 /subopcode
                    // SHL/SHR/SAR %cl, reg16 is 66 (REX.W==0) D3 /subopcode
                    // SHL/SHR/SAR %cl, reg32 is (REX.W==0) D3 /subopcode
                    // SHL/SHR/SAR %cl, reg64 is (REX.W==1) D3 /subopcode
                    emit_std_enc_enc(sink, prefix, opcode, 1, subopcode, enc_dst, rex_flags);
                }

                Imm8Reg::Imm8 { imm: num_bits } => {
                    let (opcode, prefix) = match size {
                        OperandSize::Size8 => (0xC0, LegacyPrefixes::None),
                        OperandSize::Size16 => (0xC1, LegacyPrefixes::_66),
                        OperandSize::Size32 => (0xC1, LegacyPrefixes::None),
                        OperandSize::Size64 => (0xC1, LegacyPrefixes::None),
                    };

                    // SHL/SHR/SAR $ib, reg8 is (REX.W==0) C0 /subopcode
                    // SHL/SHR/SAR $ib, reg16 is 66 (REX.W==0) C1 /subopcode
                    // SHL/SHR/SAR $ib, reg32 is (REX.W==0) C1 /subopcode ib
                    // SHL/SHR/SAR $ib, reg64 is (REX.W==1) C1 /subopcode ib
                    // When the shift amount is 1, there's an even shorter encoding, but we don't
                    // bother with that nicety here.
                    emit_std_enc_enc(sink, prefix, opcode, 1, subopcode, enc_dst, rex_flags);
                    sink.put1(num_bits);
                }
            }
        }

        Inst::XmmRmiReg {
            opcode,
            src1,
            src2,
            dst,
        } => {
            let src1 = allocs.next(src1.to_reg());
            let dst = allocs.next(dst.to_reg().to_reg());
            debug_assert_eq!(src1, dst);
            let rex = RexFlags::clear_w();
            let prefix = LegacyPrefixes::_66;
            let src2 = src2.clone().to_reg_mem_imm();
            if let RegMemImm::Imm { simm32 } = src2 {
                let (opcode_bytes, reg_digit) = match opcode {
                    SseOpcode::Psllw => (0x0F71, 6),
                    SseOpcode::Pslld => (0x0F72, 6),
                    SseOpcode::Psllq => (0x0F73, 6),
                    SseOpcode::Psraw => (0x0F71, 4),
                    SseOpcode::Psrad => (0x0F72, 4),
                    SseOpcode::Psrlw => (0x0F71, 2),
                    SseOpcode::Psrld => (0x0F72, 2),
                    SseOpcode::Psrlq => (0x0F73, 2),
                    _ => panic!("invalid opcode: {}", opcode),
                };
                let dst_enc = reg_enc(dst);
                emit_std_enc_enc(sink, prefix, opcode_bytes, 2, reg_digit, dst_enc, rex);
                let imm = (simm32)
                    .try_into()
                    .expect("the immediate must be convertible to a u8");
                sink.put1(imm);
            } else {
                let opcode_bytes = match opcode {
                    SseOpcode::Psllw => 0x0FF1,
                    SseOpcode::Pslld => 0x0FF2,
                    SseOpcode::Psllq => 0x0FF3,
                    SseOpcode::Psraw => 0x0FE1,
                    SseOpcode::Psrad => 0x0FE2,
                    SseOpcode::Psrlw => 0x0FD1,
                    SseOpcode::Psrld => 0x0FD2,
                    SseOpcode::Psrlq => 0x0FD3,
                    _ => panic!("invalid opcode: {}", opcode),
                };

                match src2 {
                    RegMemImm::Reg { reg } => {
                        let reg = allocs.next(reg);
                        emit_std_reg_reg(sink, prefix, opcode_bytes, 2, dst, reg, rex);
                    }
                    RegMemImm::Mem { addr } => {
                        let addr = &addr.finalize(state, sink).with_allocs(allocs);
                        emit_std_reg_mem(sink, prefix, opcode_bytes, 2, dst, addr, rex, 0);
                    }
                    RegMemImm::Imm { .. } => unreachable!(),
                }
            };
        }

        Inst::CmpRmiR {
            size,
            src: src_e,
            dst: reg_g,
            opcode,
        } => {
            let reg_g = allocs.next(reg_g.to_reg());

            let is_cmp = match opcode {
                CmpOpcode::Cmp => true,
                CmpOpcode::Test => false,
            };

            let mut prefix = LegacyPrefixes::None;
            if *size == OperandSize::Size16 {
                prefix = LegacyPrefixes::_66;
            }
            // A redundant REX prefix can change the meaning of this instruction.
            let mut rex = RexFlags::from((*size, reg_g));

            match src_e.clone().to_reg_mem_imm() {
                RegMemImm::Reg { reg: reg_e } => {
                    let reg_e = allocs.next(reg_e);
                    if *size == OperandSize::Size8 {
                        // Check whether the E register forces the use of a redundant REX.
                        rex.always_emit_if_8bit_needed(reg_e);
                    }

                    // Use the swapped operands encoding for CMP, to stay consistent with the output of
                    // gcc/llvm.
                    let opcode = match (*size, is_cmp) {
                        (OperandSize::Size8, true) => 0x38,
                        (_, true) => 0x39,
                        (OperandSize::Size8, false) => 0x84,
                        (_, false) => 0x85,
                    };
                    emit_std_reg_reg(sink, prefix, opcode, 1, reg_e, reg_g, rex);
                }

                RegMemImm::Mem { addr } => {
                    let addr = &addr.finalize(state, sink).with_allocs(allocs);
                    // Whereas here we revert to the "normal" G-E ordering for CMP.
                    let opcode = match (*size, is_cmp) {
                        (OperandSize::Size8, true) => 0x3A,
                        (_, true) => 0x3B,
                        (OperandSize::Size8, false) => 0x84,
                        (_, false) => 0x85,
                    };
                    emit_std_reg_mem(sink, prefix, opcode, 1, reg_g, addr, rex, 0);
                }

                RegMemImm::Imm { simm32 } => {
                    // FIXME JRS 2020Feb11: there are shorter encodings for
                    // cmp $imm, rax/eax/ax/al.
                    let use_imm8 = is_cmp && low8_will_sign_extend_to_32(simm32);

                    // And also here we use the "normal" G-E ordering.
                    let opcode = if is_cmp {
                        if *size == OperandSize::Size8 {
                            0x80
                        } else if use_imm8 {
                            0x83
                        } else {
                            0x81
                        }
                    } else {
                        if *size == OperandSize::Size8 {
                            0xF6
                        } else {
                            0xF7
                        }
                    };
                    let subopcode = if is_cmp { 7 } else { 0 };

                    let enc_g = int_reg_enc(reg_g);
                    emit_std_enc_enc(sink, prefix, opcode, 1, subopcode, enc_g, rex);
                    emit_simm(sink, if use_imm8 { 1 } else { size.to_bytes() }, simm32);
                }
            }
        }

        Inst::Setcc { cc, dst } => {
            let dst = allocs.next(dst.to_reg().to_reg());
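            // SETcc r/m8 is 0F 90+cc /0. A REX prefix is forced below so that the
            // byte registers spl/bpl/sil/dil are selected rather than ah/ch/dh/bh.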
            let opcode = 0x0f90 + cc.get_enc() as u32;
            let mut rex_flags = RexFlags::clear_w();
            rex_flags.always_emit();
            emit_std_enc_enc(
                sink,
                LegacyPrefixes::None,
                opcode,
                2,
                0,
                reg_enc(dst),
                rex_flags,
            );
        }

        Inst::Bswap { size, src, dst } => {
            let src = allocs.next(src.to_reg());
            let dst = allocs.next(dst.to_reg().to_reg());
            debug_assert_eq!(src, dst);
            let enc_reg = int_reg_enc(dst);

            // BSWAP reg32 is (REX.W==0) 0F C8
            // BSWAP reg64 is (REX.W==1) 0F C8
            let rex_flags = RexFlags::from(*size);
            rex_flags.emit_one_op(sink, enc_reg);

            sink.put1(0x0F);
            sink.put1(0xC8 | (enc_reg & 7));
        }

        Inst::Cmove {
            size,
            cc,
            consequent,
            alternative,
            dst,
        } => {
            let alternative = allocs.next(alternative.to_reg());
            let dst = allocs.next(dst.to_reg().to_reg());
            debug_assert_eq!(alternative, dst);
            let rex_flags = RexFlags::from(*size);
            let prefix = match size {
                OperandSize::Size16 => LegacyPrefixes::_66,
                OperandSize::Size32 => LegacyPrefixes::None,
                OperandSize::Size64 => LegacyPrefixes::None,
                _ => unreachable!("invalid size spec for cmove"),
            };
            let opcode = 0x0F40 + cc.get_enc() as u32;
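            // CMOVcc r, r/m is 0F 40+cc /r; the destination is written only when the
            // condition holds, which is why the alternative value must already be in
            // `dst`.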
1250            match consequent.clone().to_reg_mem() {
1251                RegMem::Reg { reg } => {
1252                    let reg = allocs.next(reg);
1253                    emit_std_reg_reg(sink, prefix, opcode, 2, dst, reg, rex_flags);
1254                }
1255                RegMem::Mem { addr } => {
1256                    let addr = &addr.finalize(state, sink).with_allocs(allocs);
1257                    emit_std_reg_mem(sink, prefix, opcode, 2, dst, addr, rex_flags, 0);
1258                }
1259            }
1260        }
1261
1262        Inst::XmmCmove {
1263            ty,
1264            cc,
1265            consequent,
1266            alternative,
1267            dst,
1268        } => {
1269            let alternative = allocs.next(alternative.to_reg());
1270            let dst = allocs.next(dst.to_reg().to_reg());
1271            debug_assert_eq!(alternative, dst);
1272            let consequent = consequent.clone().to_reg_mem().with_allocs(allocs);
1273
1274            // Lowering of the Select IR opcode when the input is an fcmp relies on the fact that
1275            // this doesn't clobber flags. Make sure to not do so here.
1276            let next = sink.get_label();
1277
1278            // Jump if cc is *not* set.
1279            one_way_jmp(sink, cc.invert(), next);
1280
1281            let op = match *ty {
1282                types::F64 => SseOpcode::Movsd,
1283                types::F32 => SseOpcode::Movsd,
1284                types::F32X4 => SseOpcode::Movaps,
1285                types::F64X2 => SseOpcode::Movapd,
1286                ty => {
1287                    debug_assert!(ty.is_vector() && ty.bytes() == 16);
1288                    SseOpcode::Movdqa
1289                }
1290            };
1291            let inst = Inst::xmm_unary_rm_r(op, consequent, Writable::from_reg(dst));
1292            inst.emit(&[], sink, info, state);
1293
1294            sink.bind_label(next);
1295        }
1296
1297        Inst::Push64 { src } => {
1298            let src = src.clone().to_reg_mem_imm().with_allocs(allocs);
1299
1300            match src {
1301                RegMemImm::Reg { reg } => {
1302                    let enc_reg = int_reg_enc(reg);
1303                    let rex = 0x40 | ((enc_reg >> 3) & 1);
1304                    if rex != 0x40 {
1305                        sink.put1(rex);
1306                    }
1307                    sink.put1(0x50 | (enc_reg & 7));
1308                }
1309
1310                RegMemImm::Mem { addr } => {
1311                    let addr = &addr.finalize(state, sink);
1312                    emit_std_enc_mem(
1313                        sink,
1314                        LegacyPrefixes::None,
1315                        0xFF,
1316                        1,
1317                        6, /*subopcode*/
1318                        addr,
1319                        RexFlags::clear_w(),
1320                        0,
1321                    );
1322                }
1323
1324                RegMemImm::Imm { simm32 } => {
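                    // Illustrative encodings: `push $1` fits the sign-extended imm8
                    // form, 6A 01, whereas `push $256` needs the long form
                    // 68 00 01 00 00.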
1325                    if low8_will_sign_extend_to_64(simm32) {
1326                        sink.put1(0x6A);
1327                        sink.put1(simm32 as u8);
1328                    } else {
1329                        sink.put1(0x68);
1330                        sink.put4(simm32);
1331                    }
1332                }
1333            }
1334        }
1335
1336        Inst::Pop64 { dst } => {
1337            let dst = allocs.next(dst.to_reg().to_reg());
1338            let enc_dst = int_reg_enc(dst);
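            // For example (illustrative): `pop %rax` is just 58, while `pop %r12`
            // needs the REX.B prefix and encodes as 41 5C.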
1339            if enc_dst >= 8 {
1340                // 0x41 == REX.{W=0, B=1}. REX.W is not needed here: push/pop default to a
                // 64-bit operand size in 64-bit mode.
1341                sink.put1(0x41);
1342            }
1343            sink.put1(0x58 + (enc_dst & 7));
1344        }
1345
1346        Inst::StackProbeLoop {
1347            tmp,
1348            frame_size,
1349            guard_size,
1350        } => {
1351            assert!(info.flags.enable_probestack());
1352            assert!(guard_size.is_power_of_two());
1353
1354            let tmp = allocs.next_writable(*tmp);
1355
1356            // Number of probes that we need to perform
1357            let probe_count = align_to(*frame_size, *guard_size) / guard_size;
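            // Worked example (illustrative): frame_size = 0x9000 with a 4 KiB
            // guard_size aligns to 0x9000, giving probe_count = 9.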
1358
1359            // The inline stack probe loop has 3 phases:
1360            //
1361            // First we compute the guard-area bound: frame_size aligned up to guard_size. We copy
1362            // the stack pointer and subtract that amount from it, which gives us a register we can
1363            // compare against while looping.
1364            //
1365            // After that we emit the loop. We move the stack pointer down by one guard_size at a
1366            // time and touch the newly exposed stack by storing to it, using the previously
1367            // computed bound register to know when to stop looping.
1368            //
1369            // When we have touched all the pages that we need, we have to restore the stack pointer
1370            // to where it was before.
1371            //
1372            // Generate the following code:
1373            //         mov  tmp_reg, rsp
1374            //         sub  tmp_reg, guard_size * probe_count
1375            // .loop_start:
1376            //         sub  rsp, guard_size
1377            //         mov  [rsp], rsp
1378            //         cmp  rsp, tmp_reg
1379            //         jne  .loop_start
1380            //         add  rsp, guard_size * probe_count
1381
1382            // Create the guard bound register
1383            // mov  tmp_reg, rsp
1384            let inst = Inst::gen_move(tmp, regs::rsp(), types::I64);
1385            inst.emit(&[], sink, info, state);
1386
1387            // sub  tmp_reg, GUARD_SIZE * probe_count
1388            let inst = Inst::alu_rmi_r(
1389                OperandSize::Size64,
1390                AluRmiROpcode::Sub,
1391                RegMemImm::imm(guard_size * probe_count),
1392                tmp,
1393            );
1394            inst.emit(&[], sink, info, state);
1395
1396            // Emit the main loop!
1397            let loop_start = sink.get_label();
1398            sink.bind_label(loop_start);
1399
1400            // sub  rsp, GUARD_SIZE
1401            let inst = Inst::alu_rmi_r(
1402                OperandSize::Size64,
1403                AluRmiROpcode::Sub,
1404                RegMemImm::imm(*guard_size),
1405                Writable::from_reg(regs::rsp()),
1406            );
1407            inst.emit(&[], sink, info, state);
1408
1409            // TODO: `mov [rsp], 0` would be better, but we don't have that instruction
1410            // Probe the stack! We don't use Inst::gen_store_stack here because we need a predictable
1411            // instruction size.
1412            // mov  [rsp], rsp
1413            let inst = Inst::mov_r_m(
1414                OperandSize::Size32, // Use Size32 since it saves us one byte
1415                regs::rsp(),
1416                SyntheticAmode::Real(Amode::imm_reg(0, regs::rsp())),
1417            );
1418            inst.emit(&[], sink, info, state);
1419
1420            // Compare and jump if we are not done yet
1421            // cmp  rsp, tmp_reg
1422            let inst = Inst::cmp_rmi_r(
1423                OperandSize::Size64,
1424                RegMemImm::reg(regs::rsp()),
1425                tmp.to_reg(),
1426            );
1427            inst.emit(&[], sink, info, state);
1428
1429            // jne  .loop_start
1430            // TODO: Encoding the JmpIf as a short jump saves us 4 bytes here.
1431            one_way_jmp(sink, CC::NZ, loop_start);
1432
1433            // The regular prologue code is going to emit a `sub` after this, so we need to
1434            // reset the stack pointer
1435            //
1436            // TODO: It would be better if we could avoid the `add` + `sub` that is generated here
1437            // and in the stack adj portion of the prologue
1438            //
1439            // add rsp, GUARD_SIZE * probe_count
1440            let inst = Inst::alu_rmi_r(
1441                OperandSize::Size64,
1442                AluRmiROpcode::Add,
1443                RegMemImm::imm(guard_size * probe_count),
1444                Writable::from_reg(regs::rsp()),
1445            );
1446            inst.emit(&[], sink, info, state);
1447        }
1448
1449        Inst::CallKnown {
1450            dest,
1451            info: call_info,
1452            ..
1453        } => {
1454            if let Some(s) = state.take_stack_map() {
1455                sink.add_stack_map(StackMapExtent::UpcomingBytes(5), s);
1456            }
1457            sink.put1(0xE8);
1458            // The reloc is applied at the start of the 4-byte immediate, but the CALL displacement
1459            // is relative to the end of the instruction; the -4 addend accounts for that difference.
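            // E.g. (illustrative): with the E8 byte at offset 0x10, the reloc is applied at
            // 0x11 and the displacement must equal target - 0x15, which the -4 provides.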
1460            emit_reloc(sink, Reloc::X86CallPCRel4, &dest, -4);
1461            sink.put4(0);
1462            if call_info.opcode.is_call() {
1463                sink.add_call_site(call_info.opcode);
1464            }
1465        }
1466
1467        Inst::CallUnknown {
1468            dest,
1469            info: call_info,
1470            ..
1471        } => {
1472            let dest = dest.with_allocs(allocs);
1473
1474            let start_offset = sink.cur_offset();
1475            match dest {
1476                RegMem::Reg { reg } => {
1477                    let reg_enc = int_reg_enc(reg);
1478                    emit_std_enc_enc(
1479                        sink,
1480                        LegacyPrefixes::None,
1481                        0xFF,
1482                        1,
1483                        2, /*subopcode*/
1484                        reg_enc,
1485                        RexFlags::clear_w(),
1486                    );
1487                }
1488
1489                RegMem::Mem { addr } => {
1490                    let addr = &addr.finalize(state, sink);
1491                    emit_std_enc_mem(
1492                        sink,
1493                        LegacyPrefixes::None,
1494                        0xFF,
1495                        1,
1496                        2, /*subopcode*/
1497                        addr,
1498                        RexFlags::clear_w(),
1499                        0,
1500                    );
1501                }
1502            }
1503            if let Some(s) = state.take_stack_map() {
1504                sink.add_stack_map(StackMapExtent::StartedAtOffset(start_offset), s);
1505            }
1506            if call_info.opcode.is_call() {
1507                sink.add_call_site(call_info.opcode);
1508            }
1509        }
1510
1511        Inst::Args { .. } => {}
1512
1513        Inst::Ret { .. } => sink.put1(0xC3),
1514
1515        Inst::JmpKnown { dst } => {
1516            let br_start = sink.cur_offset();
1517            let br_disp_off = br_start + 1;
1518            let br_end = br_start + 5;
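            // Layout (illustrative): E9 <rel32> is 5 bytes, so the displacement starts
            // 1 byte in; hence the +1 and +5 offsets above.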
1519
1520            sink.use_label_at_offset(br_disp_off, *dst, LabelUse::JmpRel32);
1521            sink.add_uncond_branch(br_start, br_end, *dst);
1522
1523            sink.put1(0xE9);
1524            // Placeholder for the label value.
1525            sink.put4(0x0);
1526        }
1527
1528        Inst::JmpIf { cc, taken } => {
1529            let cond_start = sink.cur_offset();
1530            let cond_disp_off = cond_start + 2;
1531
1532            sink.use_label_at_offset(cond_disp_off, *taken, LabelUse::JmpRel32);
1533            // Since this is not a terminator, don't enroll in the branch inversion mechanism.
1534
1535            sink.put1(0x0F);
1536            sink.put1(0x80 + cc.get_enc());
1537            // Placeholder for the label value.
1538            sink.put4(0x0);
1539        }
1540
1541        Inst::JmpCond {
1542            cc,
1543            taken,
1544            not_taken,
1545        } => {
1546            // If taken.
1547            let cond_start = sink.cur_offset();
1548            let cond_disp_off = cond_start + 2;
1549            let cond_end = cond_start + 6;
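            // Layout (illustrative): 0F 8x <rel32> is 6 bytes with the displacement at
            // +2; the fallthrough jump emitted below is E9 <rel32>, 5 bytes with the
            // displacement at +1.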
1550
1551            sink.use_label_at_offset(cond_disp_off, *taken, LabelUse::JmpRel32);
1552            let inverted: [u8; 6] = [0x0F, 0x80 + (cc.invert().get_enc()), 0x00, 0x00, 0x00, 0x00];
1553            sink.add_cond_branch(cond_start, cond_end, *taken, &inverted[..]);
1554
1555            sink.put1(0x0F);
1556            sink.put1(0x80 + cc.get_enc());
1557            // Placeholder for the label value.
1558            sink.put4(0x0);
1559
1560            // If not taken.
1561            let uncond_start = sink.cur_offset();
1562            let uncond_disp_off = uncond_start + 1;
1563            let uncond_end = uncond_start + 5;
1564
1565            sink.use_label_at_offset(uncond_disp_off, *not_taken, LabelUse::JmpRel32);
1566            sink.add_uncond_branch(uncond_start, uncond_end, *not_taken);
1567
1568            sink.put1(0xE9);
1569            // Placeholder for the label value.
1570            sink.put4(0x0);
1571        }
1572
1573        Inst::JmpUnknown { target } => {
1574            let target = target.with_allocs(allocs);
1575
1576            match target {
1577                RegMem::Reg { reg } => {
1578                    let reg_enc = int_reg_enc(reg);
1579                    emit_std_enc_enc(
1580                        sink,
1581                        LegacyPrefixes::None,
1582                        0xFF,
1583                        1,
1584                        4, /*subopcode*/
1585                        reg_enc,
1586                        RexFlags::clear_w(),
1587                    );
1588                }
1589
1590                RegMem::Mem { addr } => {
1591                    let addr = &addr.finalize(state, sink);
1592                    emit_std_enc_mem(
1593                        sink,
1594                        LegacyPrefixes::None,
1595                        0xFF,
1596                        1,
1597                        4, /*subopcode*/
1598                        addr,
1599                        RexFlags::clear_w(),
1600                        0,
1601                    );
1602                }
1603            }
1604        }
1605
1606        Inst::JmpTableSeq {
1607            idx,
1608            tmp1,
1609            tmp2,
1610            ref targets,
1611            default_target,
1612            ..
1613        } => {
1614            let idx = allocs.next(*idx);
1615            let tmp1 = Writable::from_reg(allocs.next(tmp1.to_reg()));
1616            let tmp2 = Writable::from_reg(allocs.next(tmp2.to_reg()));
1617
1618            // This sequence is *one* instruction in the vcode, and is expanded only here at
1619            // emission time, because we cannot allow the regalloc to insert spills/reloads in
1620            // the middle; we depend on hardcoded PC-rel addressing below.
1621            //
1622            // We don't have to worry about emitting islands, because the only label-use type has a
1623            // maximum range of 2 GB. If we later consider using shorter-range label references,
1624            // this will need to be revisited.
1625
1626            // We generate the following sequence. Note that the only read of %idx is before the
1627            // write to %tmp2, so regalloc may use the same register for both; fix x64/inst/mod.rs
1628            // if you change this.
1629            // lea start_of_jump_table_offset(%rip), %tmp1
1630            // movslq [%tmp1, %idx, 4], %tmp2 ;; shift of 2, viz. multiply index by 4
1631            // addq %tmp2, %tmp1
1632            // jmp *%tmp1
1633            // $start_of_jump_table:
1634            // -- jump table entries
1635
1636            // Load base address of jump table.
1637            let start_of_jumptable = sink.get_label();
1638            let inst = Inst::lea(Amode::rip_relative(start_of_jumptable), tmp1);
1639            inst.emit(&[], sink, info, state);
1640
1641            // Load value out of the jump table. It's a relative offset to the target block, so it
1642            // might be negative; use a sign-extension.
1643            let inst = Inst::movsx_rm_r(
1644                ExtMode::LQ,
1645                RegMem::mem(Amode::imm_reg_reg_shift(
1646                    0,
1647                    Gpr::new(tmp1.to_reg()).unwrap(),
1648                    Gpr::new(idx).unwrap(),
1649                    2,
1650                )),
1651                tmp2,
1652            );
1653            inst.emit(&[], sink, info, state);
1654
1655            // Add base of jump table to jump-table-sourced block offset.
1656            let inst = Inst::alu_rmi_r(
1657                OperandSize::Size64,
1658                AluRmiROpcode::Add,
1659                RegMemImm::reg(tmp2.to_reg()),
1660                tmp1,
1661            );
1662            inst.emit(&[], sink, info, state);
1663
1664            // Branch to computed address.
1665            let inst = Inst::jmp_unknown(RegMem::reg(tmp1.to_reg()));
1666            inst.emit(&[], sink, info, state);
1667
1668            // Emit jump table (table of 32-bit offsets).
1669            sink.bind_label(start_of_jumptable);
1670            let jt_off = sink.cur_offset();
1671            for &target in targets.iter().chain(std::iter::once(default_target)) {
1672                let word_off = sink.cur_offset();
1673                // off_into_table is written into the word as an addend for the label use, to be
1674                // patched at the end of codegen. The patched offset is relative to this jump
1675                // table entry; adding the addend makes the final value relative to the jump
1676                // table's start.
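                // Worked example (illustrative): the third entry sits at jt_off + 8, so
                // it is written as 8 and patched to (target - word_off) + 8, i.e.
                // target - jt_off.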
1677                let off_into_table = word_off - jt_off;
1678                sink.use_label_at_offset(word_off, target, LabelUse::PCRel32);
1679                sink.put4(off_into_table);
1680            }
1681        }
1682
1683        Inst::TrapIf { cc, trap_code } => {
1684            let trap_label = sink.defer_trap(*trap_code, state.take_stack_map());
1685            one_way_jmp(sink, *cc, trap_label);
1686        }
1687
1688        Inst::TrapIfAnd {
1689            cc1,
1690            cc2,
1691            trap_code,
1692        } => {
1693            let trap_label = sink.defer_trap(*trap_code, state.take_stack_map());
1694            let else_label = sink.get_label();
1695
1696            // Jump to the end if the first condition isn't true, and then if
1697            // the second condition is true go to the trap.
1698            one_way_jmp(sink, cc1.invert(), else_label);
1699            one_way_jmp(sink, *cc2, trap_label);
1700
1701            sink.bind_label(else_label);
1702        }
1703
1704        Inst::TrapIfOr {
1705            cc1,
1706            cc2,
1707            trap_code,
1708        } => {
1709            let trap_label = sink.defer_trap(*trap_code, state.take_stack_map());
1710
1711            // Emit two jumps to the same trap if either condition code is true.
1712            one_way_jmp(sink, *cc1, trap_label);
1713            one_way_jmp(sink, *cc2, trap_label);
1714        }
1715
1716        Inst::XmmUnaryRmR { op, src, dst } => {
1717            emit(
1718                &Inst::XmmUnaryRmRUnaligned {
1719                    op: *op,
1720                    src: XmmMem::new(src.clone().into()).unwrap(),
1721                    dst: *dst,
1722                },
1723                allocs,
1724                sink,
1725                info,
1726                state,
1727            );
1728        }
1729
1730        Inst::XmmUnaryRmRUnaligned {
1731            op,
1732            src: src_e,
1733            dst: reg_g,
1734        } => {
1735            let reg_g = allocs.next(reg_g.to_reg().to_reg());
1736            let src_e = src_e.clone().to_reg_mem().with_allocs(allocs);
1737
1738            let rex = RexFlags::clear_w();
1739
1740            let (prefix, opcode, num_opcodes) = match op {
1741                SseOpcode::Cvtdq2pd => (LegacyPrefixes::_F3, 0x0FE6, 2),
1742                SseOpcode::Cvtpd2ps => (LegacyPrefixes::_66, 0x0F5A, 2),
1743                SseOpcode::Cvtps2pd => (LegacyPrefixes::None, 0x0F5A, 2),
1744                SseOpcode::Cvtdq2ps => (LegacyPrefixes::None, 0x0F5B, 2),
1745                SseOpcode::Cvtss2sd => (LegacyPrefixes::_F3, 0x0F5A, 2),
1746                SseOpcode::Cvtsd2ss => (LegacyPrefixes::_F2, 0x0F5A, 2),
1747                SseOpcode::Cvttpd2dq => (LegacyPrefixes::_66, 0x0FE6, 2),
1748                SseOpcode::Cvttps2dq => (LegacyPrefixes::_F3, 0x0F5B, 2),
1749                SseOpcode::Movaps => (LegacyPrefixes::None, 0x0F28, 2),
1750                SseOpcode::Movapd => (LegacyPrefixes::_66, 0x0F28, 2),
1751                SseOpcode::Movdqa => (LegacyPrefixes::_66, 0x0F6F, 2),
1752                SseOpcode::Movdqu => (LegacyPrefixes::_F3, 0x0F6F, 2),
1753                SseOpcode::Movsd => (LegacyPrefixes::_F2, 0x0F10, 2),
1754                SseOpcode::Movss => (LegacyPrefixes::_F3, 0x0F10, 2),
1755                SseOpcode::Movups => (LegacyPrefixes::None, 0x0F10, 2),
1756                SseOpcode::Movupd => (LegacyPrefixes::_66, 0x0F10, 2),
1757                SseOpcode::Pabsb => (LegacyPrefixes::_66, 0x0F381C, 3),
1758                SseOpcode::Pabsw => (LegacyPrefixes::_66, 0x0F381D, 3),
1759                SseOpcode::Pabsd => (LegacyPrefixes::_66, 0x0F381E, 3),
1760                SseOpcode::Pmovsxbd => (LegacyPrefixes::_66, 0x0F3821, 3),
1761                SseOpcode::Pmovsxbw => (LegacyPrefixes::_66, 0x0F3820, 3),
1762                SseOpcode::Pmovsxbq => (LegacyPrefixes::_66, 0x0F3822, 3),
1763                SseOpcode::Pmovsxwd => (LegacyPrefixes::_66, 0x0F3823, 3),
1764                SseOpcode::Pmovsxwq => (LegacyPrefixes::_66, 0x0F3824, 3),
1765                SseOpcode::Pmovsxdq => (LegacyPrefixes::_66, 0x0F3825, 3),
1766                SseOpcode::Pmovzxbd => (LegacyPrefixes::_66, 0x0F3831, 3),
1767                SseOpcode::Pmovzxbw => (LegacyPrefixes::_66, 0x0F3830, 3),
1768                SseOpcode::Pmovzxbq => (LegacyPrefixes::_66, 0x0F3832, 3),
1769                SseOpcode::Pmovzxwd => (LegacyPrefixes::_66, 0x0F3833, 3),
1770                SseOpcode::Pmovzxwq => (LegacyPrefixes::_66, 0x0F3834, 3),
1771                SseOpcode::Pmovzxdq => (LegacyPrefixes::_66, 0x0F3835, 3),
1772                SseOpcode::Sqrtps => (LegacyPrefixes::None, 0x0F51, 2),
1773                SseOpcode::Sqrtpd => (LegacyPrefixes::_66, 0x0F51, 2),
1774                SseOpcode::Sqrtss => (LegacyPrefixes::_F3, 0x0F51, 2),
1775                SseOpcode::Sqrtsd => (LegacyPrefixes::_F2, 0x0F51, 2),
1776                SseOpcode::Movddup => (LegacyPrefixes::_F2, 0x0F12, 2),
1777                _ => unimplemented!("Opcode {:?} not implemented", op),
1778            };
1779
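            // E.g. (illustrative): a Movss load such as `movss (%rax), %xmm1` comes out
            // as F3 0F 10 08 -- the F3 prefix, the two opcode bytes, then ModRM 0x08
            // selecting xmm1 and [rax].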
1780            match src_e {
1781                RegMem::Reg { reg: reg_e } => {
1782                    emit_std_reg_reg(sink, prefix, opcode, num_opcodes, reg_g, reg_e, rex);
1783                }
1784                RegMem::Mem { addr } => {
1785                    let addr = &addr.finalize(state, sink);
1786                    emit_std_reg_mem(sink, prefix, opcode, num_opcodes, reg_g, addr, rex, 0);
1787                }
1788            };
1789        }
1790
1791        Inst::XmmUnaryRmRImm { op, src, dst, imm } => {
1792            let dst = allocs.next(dst.to_reg().to_reg());
1793            let src = src.clone().to_reg_mem().with_allocs(allocs);
1794            let rex = RexFlags::clear_w();
1795
1796            let (prefix, opcode, len) = match op {
1797                SseOpcode::Roundps => (LegacyPrefixes::_66, 0x0F3A08, 3),
1798                SseOpcode::Roundss => (LegacyPrefixes::_66, 0x0F3A0A, 3),
1799                SseOpcode::Roundpd => (LegacyPrefixes::_66, 0x0F3A09, 3),
1800                SseOpcode::Roundsd => (LegacyPrefixes::_66, 0x0F3A0B, 3),
1801                SseOpcode::Pshufd => (LegacyPrefixes::_66, 0x0F70, 2),
1802                SseOpcode::Pshuflw => (LegacyPrefixes::_F2, 0x0F70, 2),
1803                SseOpcode::Pshufhw => (LegacyPrefixes::_F3, 0x0F70, 2),
1804                _ => unimplemented!("Opcode {:?} not implemented", op),
1805            };
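            // E.g. (illustrative): `roundps $0, %xmm2, %xmm1` is 66 0F 3A 08 CA 00,
            // with the rounding-mode immediate emitted as the final byte below.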
1806            match src {
1807                RegMem::Reg { reg } => {
1808                    emit_std_reg_reg(sink, prefix, opcode, len, dst, reg, rex);
1809                }
1810                RegMem::Mem { addr } => {
1811                    let addr = &addr.finalize(state, sink);
1812                    // N.B.: bytes_at_end == 1, because of the `imm` byte below.
1813                    emit_std_reg_mem(sink, prefix, opcode, len, dst, addr, rex, 1);
1814                }
1815            }
1816            sink.put1(*imm);
1817        }
1818
1819        Inst::XmmUnaryRmREvex { op, src, dst } => {
1820            let dst = allocs.next(dst.to_reg().to_reg());
1821            let src = src.clone().to_reg_mem().with_allocs(allocs);
1822
1823            let (prefix, map, w, opcode) = match op {
1824                Avx512Opcode::Vcvtudq2ps => (LegacyPrefixes::_F2, OpcodeMap::_0F, false, 0x7a),
1825                Avx512Opcode::Vpabsq => (LegacyPrefixes::_66, OpcodeMap::_0F38, true, 0x1f),
1826                Avx512Opcode::Vpopcntb => (LegacyPrefixes::_66, OpcodeMap::_0F38, false, 0x54),
1827                _ => unimplemented!("Opcode {:?} not implemented", op),
1828            };
1829            match src {
1830                RegMem::Reg { reg: src } => EvexInstruction::new()
1831                    .length(EvexVectorLength::V128)
1832                    .prefix(prefix)
1833                    .map(map)
1834                    .w(w)
1835                    .opcode(opcode)
1836                    .reg(dst.to_real_reg().unwrap().hw_enc())
1837                    .rm(src.to_real_reg().unwrap().hw_enc())
1838                    .encode(sink),
1839                _ => todo!(),
1840            };
1841        }
1842
1843        Inst::XmmRmR {
1844            op,
1845            src1,
1846            src2,
1847            dst,
1848        } => emit(
1849            &Inst::XmmRmRUnaligned {
1850                op: *op,
1851                dst: *dst,
1852                src1: *src1,
1853                src2: XmmMem::new(src2.clone().to_reg_mem()).unwrap(),
1854            },
1855            allocs,
1856            sink,
1857            info,
1858            state,
1859        ),
1860
1861        Inst::XmmRmRUnaligned {
1862            op,
1863            src1,
1864            src2: src_e,
1865            dst: reg_g,
1866        } => {
1867            let src1 = allocs.next(src1.to_reg());
1868            let reg_g = allocs.next(reg_g.to_reg().to_reg());
1869            let src_e = src_e.clone().to_reg_mem().with_allocs(allocs);
1870            debug_assert_eq!(src1, reg_g);
1871
1872            let rex = RexFlags::clear_w();
1873            let (prefix, opcode, length) = match op {
1874                SseOpcode::Addps => (LegacyPrefixes::None, 0x0F58, 2),
1875                SseOpcode::Addpd => (LegacyPrefixes::_66, 0x0F58, 2),
1876                SseOpcode::Addss => (LegacyPrefixes::_F3, 0x0F58, 2),
1877                SseOpcode::Addsd => (LegacyPrefixes::_F2, 0x0F58, 2),
1878                SseOpcode::Andps => (LegacyPrefixes::None, 0x0F54, 2),
1879                SseOpcode::Andpd => (LegacyPrefixes::_66, 0x0F54, 2),
1880                SseOpcode::Andnps => (LegacyPrefixes::None, 0x0F55, 2),
1881                SseOpcode::Andnpd => (LegacyPrefixes::_66, 0x0F55, 2),
1882                SseOpcode::Divps => (LegacyPrefixes::None, 0x0F5E, 2),
1883                SseOpcode::Divpd => (LegacyPrefixes::_66, 0x0F5E, 2),
1884                SseOpcode::Divss => (LegacyPrefixes::_F3, 0x0F5E, 2),
1885                SseOpcode::Divsd => (LegacyPrefixes::_F2, 0x0F5E, 2),
1886                SseOpcode::Maxps => (LegacyPrefixes::None, 0x0F5F, 2),
1887                SseOpcode::Maxpd => (LegacyPrefixes::_66, 0x0F5F, 2),
1888                SseOpcode::Maxss => (LegacyPrefixes::_F3, 0x0F5F, 2),
1889                SseOpcode::Maxsd => (LegacyPrefixes::_F2, 0x0F5F, 2),
1890                SseOpcode::Minps => (LegacyPrefixes::None, 0x0F5D, 2),
1891                SseOpcode::Minpd => (LegacyPrefixes::_66, 0x0F5D, 2),
1892                SseOpcode::Minss => (LegacyPrefixes::_F3, 0x0F5D, 2),
1893                SseOpcode::Minsd => (LegacyPrefixes::_F2, 0x0F5D, 2),
1894                SseOpcode::Movlhps => (LegacyPrefixes::None, 0x0F16, 2),
1895                SseOpcode::Movsd => (LegacyPrefixes::_F2, 0x0F10, 2),
1896                SseOpcode::Mulps => (LegacyPrefixes::None, 0x0F59, 2),
1897                SseOpcode::Mulpd => (LegacyPrefixes::_66, 0x0F59, 2),
1898                SseOpcode::Mulss => (LegacyPrefixes::_F3, 0x0F59, 2),
1899                SseOpcode::Mulsd => (LegacyPrefixes::_F2, 0x0F59, 2),
1900                SseOpcode::Orpd => (LegacyPrefixes::_66, 0x0F56, 2),
1901                SseOpcode::Orps => (LegacyPrefixes::None, 0x0F56, 2),
1902                SseOpcode::Packssdw => (LegacyPrefixes::_66, 0x0F6B, 2),
1903                SseOpcode::Packsswb => (LegacyPrefixes::_66, 0x0F63, 2),
1904                SseOpcode::Packusdw => (LegacyPrefixes::_66, 0x0F382B, 3),
1905                SseOpcode::Packuswb => (LegacyPrefixes::_66, 0x0F67, 2),
1906                SseOpcode::Paddb => (LegacyPrefixes::_66, 0x0FFC, 2),
1907                SseOpcode::Paddd => (LegacyPrefixes::_66, 0x0FFE, 2),
1908                SseOpcode::Paddq => (LegacyPrefixes::_66, 0x0FD4, 2),
1909                SseOpcode::Paddw => (LegacyPrefixes::_66, 0x0FFD, 2),
1910                SseOpcode::Paddsb => (LegacyPrefixes::_66, 0x0FEC, 2),
1911                SseOpcode::Paddsw => (LegacyPrefixes::_66, 0x0FED, 2),
1912                SseOpcode::Paddusb => (LegacyPrefixes::_66, 0x0FDC, 2),
1913                SseOpcode::Paddusw => (LegacyPrefixes::_66, 0x0FDD, 2),
1914                SseOpcode::Pmaddubsw => (LegacyPrefixes::_66, 0x0F3804, 3),
1915                SseOpcode::Pand => (LegacyPrefixes::_66, 0x0FDB, 2),
1916                SseOpcode::Pandn => (LegacyPrefixes::_66, 0x0FDF, 2),
1917                SseOpcode::Pavgb => (LegacyPrefixes::_66, 0x0FE0, 2),
1918                SseOpcode::Pavgw => (LegacyPrefixes::_66, 0x0FE3, 2),
1919                SseOpcode::Pcmpeqb => (LegacyPrefixes::_66, 0x0F74, 2),
1920                SseOpcode::Pcmpeqw => (LegacyPrefixes::_66, 0x0F75, 2),
1921                SseOpcode::Pcmpeqd => (LegacyPrefixes::_66, 0x0F76, 2),
1922                SseOpcode::Pcmpeqq => (LegacyPrefixes::_66, 0x0F3829, 3),
1923                SseOpcode::Pcmpgtb => (LegacyPrefixes::_66, 0x0F64, 2),
1924                SseOpcode::Pcmpgtw => (LegacyPrefixes::_66, 0x0F65, 2),
1925                SseOpcode::Pcmpgtd => (LegacyPrefixes::_66, 0x0F66, 2),
1926                SseOpcode::Pcmpgtq => (LegacyPrefixes::_66, 0x0F3837, 3),
1927                SseOpcode::Pmaddwd => (LegacyPrefixes::_66, 0x0FF5, 2),
1928                SseOpcode::Pmaxsb => (LegacyPrefixes::_66, 0x0F383C, 3),
1929                SseOpcode::Pmaxsw => (LegacyPrefixes::_66, 0x0FEE, 2),
1930                SseOpcode::Pmaxsd => (LegacyPrefixes::_66, 0x0F383D, 3),
1931                SseOpcode::Pmaxub => (LegacyPrefixes::_66, 0x0FDE, 2),
1932                SseOpcode::Pmaxuw => (LegacyPrefixes::_66, 0x0F383E, 3),
1933                SseOpcode::Pmaxud => (LegacyPrefixes::_66, 0x0F383F, 3),
1934                SseOpcode::Pminsb => (LegacyPrefixes::_66, 0x0F3838, 3),
1935                SseOpcode::Pminsw => (LegacyPrefixes::_66, 0x0FEA, 2),
1936                SseOpcode::Pminsd => (LegacyPrefixes::_66, 0x0F3839, 3),
1937                SseOpcode::Pminub => (LegacyPrefixes::_66, 0x0FDA, 2),
1938                SseOpcode::Pminuw => (LegacyPrefixes::_66, 0x0F383A, 3),
1939                SseOpcode::Pminud => (LegacyPrefixes::_66, 0x0F383B, 3),
1940                SseOpcode::Pmuldq => (LegacyPrefixes::_66, 0x0F3828, 3),
1941                SseOpcode::Pmulhw => (LegacyPrefixes::_66, 0x0FE5, 2),
1942                SseOpcode::Pmulhrsw => (LegacyPrefixes::_66, 0x0F380B, 3),
1943                SseOpcode::Pmulhuw => (LegacyPrefixes::_66, 0x0FE4, 2),
1944                SseOpcode::Pmulld => (LegacyPrefixes::_66, 0x0F3840, 3),
1945                SseOpcode::Pmullw => (LegacyPrefixes::_66, 0x0FD5, 2),
1946                SseOpcode::Pmuludq => (LegacyPrefixes::_66, 0x0FF4, 2),
1947                SseOpcode::Por => (LegacyPrefixes::_66, 0x0FEB, 2),
1948                SseOpcode::Pshufb => (LegacyPrefixes::_66, 0x0F3800, 3),
1949                SseOpcode::Psubb => (LegacyPrefixes::_66, 0x0FF8, 2),
1950                SseOpcode::Psubd => (LegacyPrefixes::_66, 0x0FFA, 2),
1951                SseOpcode::Psubq => (LegacyPrefixes::_66, 0x0FFB, 2),
1952                SseOpcode::Psubw => (LegacyPrefixes::_66, 0x0FF9, 2),
1953                SseOpcode::Psubsb => (LegacyPrefixes::_66, 0x0FE8, 2),
1954                SseOpcode::Psubsw => (LegacyPrefixes::_66, 0x0FE9, 2),
1955                SseOpcode::Psubusb => (LegacyPrefixes::_66, 0x0FD8, 2),
1956                SseOpcode::Psubusw => (LegacyPrefixes::_66, 0x0FD9, 2),
1957                SseOpcode::Punpckhbw => (LegacyPrefixes::_66, 0x0F68, 2),
1958                SseOpcode::Punpckhwd => (LegacyPrefixes::_66, 0x0F69, 2),
1959                SseOpcode::Punpcklbw => (LegacyPrefixes::_66, 0x0F60, 2),
1960                SseOpcode::Punpcklwd => (LegacyPrefixes::_66, 0x0F61, 2),
1961                SseOpcode::Punpckldq => (LegacyPrefixes::_66, 0x0F62, 2),
1962                SseOpcode::Punpcklqdq => (LegacyPrefixes::_66, 0x0F6C, 2),
1963                SseOpcode::Punpckhdq => (LegacyPrefixes::_66, 0x0F6A, 2),
1964                SseOpcode::Punpckhqdq => (LegacyPrefixes::_66, 0x0F6D, 2),
1965                SseOpcode::Pxor => (LegacyPrefixes::_66, 0x0FEF, 2),
1966                SseOpcode::Subps => (LegacyPrefixes::None, 0x0F5C, 2),
1967                SseOpcode::Subpd => (LegacyPrefixes::_66, 0x0F5C, 2),
1968                SseOpcode::Subss => (LegacyPrefixes::_F3, 0x0F5C, 2),
1969                SseOpcode::Subsd => (LegacyPrefixes::_F2, 0x0F5C, 2),
1970                SseOpcode::Unpcklps => (LegacyPrefixes::None, 0x0F14, 2),
1971                SseOpcode::Xorps => (LegacyPrefixes::None, 0x0F57, 2),
1972                SseOpcode::Xorpd => (LegacyPrefixes::_66, 0x0F57, 2),
1973                SseOpcode::Phaddw => (LegacyPrefixes::_66, 0x0F3801, 3),
1974                SseOpcode::Phaddd => (LegacyPrefixes::_66, 0x0F3802, 3),
1975                _ => unimplemented!("Opcode {:?} not implemented", op),
1976            };
1977
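            // E.g. (illustrative): `paddd %xmm2, %xmm1` is 66 0F FE CA. These SSE forms
            // are two-operand, which is why src1 must have been allocated to the same
            // register as dst above.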
1978            match src_e {
1979                RegMem::Reg { reg: reg_e } => {
1980                    emit_std_reg_reg(sink, prefix, opcode, length, reg_g, reg_e, rex);
1981                }
1982                RegMem::Mem { addr } => {
1983                    let addr = &addr.finalize(state, sink);
1984                    emit_std_reg_mem(sink, prefix, opcode, length, reg_g, addr, rex, 0);
1985                }
1986            }
1987        }
1988
1989        Inst::XmmRmRBlend {
1990            op,
1991            src1,
1992            src2,
1993            dst,
1994            mask,
1995        } => {
1996            let src1 = allocs.next(src1.to_reg());
1997            let mask = allocs.next(mask.to_reg());
1998            debug_assert_eq!(mask, regs::xmm0());
1999            let reg_g = allocs.next(dst.to_reg().to_reg());
2000            debug_assert_eq!(src1, reg_g);
2001            let src_e = src2.clone().to_reg_mem().with_allocs(allocs);
2002
2003            let rex = RexFlags::clear_w();
2004            let (prefix, opcode, length) = match op {
2005                SseOpcode::Blendvps => (LegacyPrefixes::_66, 0x0F3814, 3),
2006                SseOpcode::Blendvpd => (LegacyPrefixes::_66, 0x0F3815, 3),
2007                SseOpcode::Pblendvb => (LegacyPrefixes::_66, 0x0F3810, 3),
2008                _ => unimplemented!("Opcode {:?} not implemented", op),
2009            };
2010
2011            match src_e {
2012                RegMem::Reg { reg: reg_e } => {
2013                    emit_std_reg_reg(sink, prefix, opcode, length, reg_g, reg_e, rex);
2014                }
2015                RegMem::Mem { addr } => {
2016                    let addr = &addr.finalize(state, sink);
2017                    emit_std_reg_mem(sink, prefix, opcode, length, reg_g, addr, rex, 0);
2018                }
2019            }
2020        }
2021
2022        Inst::XmmRmiRVex {
2023            op,
2024            src1,
2025            src2,
2026            dst,
2027        } => {
2028            use LegacyPrefixes as LP;
2029            use OpcodeMap as OM;
2030
2031            let dst = allocs.next(dst.to_reg().to_reg());
2032            let src1 = allocs.next(src1.to_reg());
2033            let src2 = src2.clone().to_reg_mem_imm().with_allocs(allocs);
2034
2035            let src2 = match src2 {
2036                // For opcodes where one of the operands is an immediate the
2037                // encoding is a bit different, notably the usage of
2038                // `opcode_ext`, so handle that specially here.
2039                RegMemImm::Imm { simm32 } => {
2040                    let (opcode, opcode_ext, prefix) = match op {
2041                        AvxOpcode::Vpsrlw => (0x71, 2, LegacyPrefixes::_66),
2042                        AvxOpcode::Vpsrld => (0x72, 2, LegacyPrefixes::_66),
2043                        AvxOpcode::Vpsrlq => (0x73, 2, LegacyPrefixes::_66),
2044                        AvxOpcode::Vpsllw => (0x71, 6, LegacyPrefixes::_66),
2045                        AvxOpcode::Vpslld => (0x72, 6, LegacyPrefixes::_66),
2046                        AvxOpcode::Vpsllq => (0x73, 6, LegacyPrefixes::_66),
2047                        AvxOpcode::Vpsraw => (0x71, 4, LegacyPrefixes::_66),
2048                        AvxOpcode::Vpsrad => (0x72, 4, LegacyPrefixes::_66),
2049                        _ => panic!("unexpected rmi_r_vex opcode with immediate {op:?}"),
2050                    };
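                    // E.g. (illustrative): `vpsllw $3, %xmm2, %xmm1` uses opcode 0x71
                    // with /6 in the ModRM reg field, the destination in vvvv, the
                    // source in rm, and the shift amount as the trailing immediate.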
2051                    VexInstruction::new()
2052                        .length(VexVectorLength::V128)
2053                        .prefix(prefix)
2054                        .map(OpcodeMap::_0F)
2055                        .opcode(opcode)
2056                        .opcode_ext(opcode_ext)
2057                        .vvvv(dst.to_real_reg().unwrap().hw_enc())
2059                        .rm(src1.to_real_reg().unwrap().hw_enc())
2060                        .imm(simm32.try_into().unwrap())
2061                        .encode(sink);
2062                    return;
2063                }
2064                RegMemImm::Reg { reg } => {
2065                    RegisterOrAmode::Register(reg.to_real_reg().unwrap().hw_enc().into())
2066                }
2067                RegMemImm::Mem { addr } => RegisterOrAmode::Amode(addr.finalize(state, sink)),
2068            };
2069            let (prefix, map, opcode) = match op {
2070                AvxOpcode::Vminps => (LP::None, OM::_0F, 0x5D),
2071                AvxOpcode::Vminpd => (LP::_66, OM::_0F, 0x5D),
2072                AvxOpcode::Vmaxps => (LP::None, OM::_0F, 0x5F),
2073                AvxOpcode::Vmaxpd => (LP::_66, OM::_0F, 0x5F),
2074                AvxOpcode::Vandnps => (LP::None, OM::_0F, 0x55),
2075                AvxOpcode::Vandnpd => (LP::_66, OM::_0F, 0x55),
2076                AvxOpcode::Vpandn => (LP::_66, OM::_0F, 0xDF),
2077                AvxOpcode::Vpsrlw => (LP::_66, OM::_0F, 0xD1),
2078                AvxOpcode::Vpsrld => (LP::_66, OM::_0F, 0xD2),
2079                AvxOpcode::Vpsrlq => (LP::_66, OM::_0F, 0xD3),
2080                AvxOpcode::Vpaddb => (LP::_66, OM::_0F, 0xFC),
2081                AvxOpcode::Vpaddw => (LP::_66, OM::_0F, 0xFD),
2082                AvxOpcode::Vpaddd => (LP::_66, OM::_0F, 0xFE),
2083                AvxOpcode::Vpaddq => (LP::_66, OM::_0F, 0xD4),
2084                AvxOpcode::Vpaddsb => (LP::_66, OM::_0F, 0xEC),
2085                AvxOpcode::Vpaddsw => (LP::_66, OM::_0F, 0xED),
2086                AvxOpcode::Vpaddusb => (LP::_66, OM::_0F, 0xDC),
2087                AvxOpcode::Vpaddusw => (LP::_66, OM::_0F, 0xDD),
2088                AvxOpcode::Vpsubb => (LP::_66, OM::_0F, 0xF8),
2089                AvxOpcode::Vpsubw => (LP::_66, OM::_0F, 0xF9),
2090                AvxOpcode::Vpsubd => (LP::_66, OM::_0F, 0xFA),
2091                AvxOpcode::Vpsubq => (LP::_66, OM::_0F, 0xFB),
2092                AvxOpcode::Vpsubsb => (LP::_66, OM::_0F, 0xE8),
2093                AvxOpcode::Vpsubsw => (LP::_66, OM::_0F, 0xE9),
2094                AvxOpcode::Vpsubusb => (LP::_66, OM::_0F, 0xD8),
2095                AvxOpcode::Vpsubusw => (LP::_66, OM::_0F, 0xD9),
2096                AvxOpcode::Vpavgb => (LP::_66, OM::_0F, 0xE0),
2097                AvxOpcode::Vpavgw => (LP::_66, OM::_0F, 0xE3),
2098                AvxOpcode::Vpand => (LP::_66, OM::_0F, 0xDB),
2099                AvxOpcode::Vandps => (LP::None, OM::_0F, 0x54),
2100                AvxOpcode::Vandpd => (LP::_66, OM::_0F, 0x54),
2101                AvxOpcode::Vpor => (LP::_66, OM::_0F, 0xEB),
2102                AvxOpcode::Vorps => (LP::None, OM::_0F, 0x56),
2103                AvxOpcode::Vorpd => (LP::_66, OM::_0F, 0x56),
2104                AvxOpcode::Vpxor => (LP::_66, OM::_0F, 0xEF),
2105                AvxOpcode::Vxorps => (LP::None, OM::_0F, 0x57),
2106                AvxOpcode::Vxorpd => (LP::_66, OM::_0F, 0x57),
2107                AvxOpcode::Vpmullw => (LP::_66, OM::_0F, 0xD5),
2108                AvxOpcode::Vpmulld => (LP::_66, OM::_0F38, 0x40),
2109                AvxOpcode::Vpmulhw => (LP::_66, OM::_0F, 0xE5),
2110                AvxOpcode::Vpmulhrsw => (LP::_66, OM::_0F38, 0x0B),
2111                AvxOpcode::Vpmulhuw => (LP::_66, OM::_0F, 0xE4),
2112                AvxOpcode::Vpmuldq => (LP::_66, OM::_0F38, 0x28),
2113                AvxOpcode::Vpmuludq => (LP::_66, OM::_0F, 0xF4),
2114                AvxOpcode::Vpunpckhwd => (LP::_66, OM::_0F, 0x69),
2115                AvxOpcode::Vpunpcklwd => (LP::_66, OM::_0F, 0x61),
2116                AvxOpcode::Vunpcklps => (LP::None, OM::_0F, 0x14),
2117                AvxOpcode::Vaddps => (LP::None, OM::_0F, 0x58),
2118                AvxOpcode::Vaddpd => (LP::_66, OM::_0F, 0x58),
2119                AvxOpcode::Vsubps => (LP::None, OM::_0F, 0x5C),
2120                AvxOpcode::Vsubpd => (LP::_66, OM::_0F, 0x5C),
2121                AvxOpcode::Vmulps => (LP::None, OM::_0F, 0x59),
2122                AvxOpcode::Vmulpd => (LP::_66, OM::_0F, 0x59),
2123                AvxOpcode::Vdivps => (LP::None, OM::_0F, 0x5E),
2124                AvxOpcode::Vdivpd => (LP::_66, OM::_0F, 0x5E),
2125                AvxOpcode::Vpcmpeqb => (LP::_66, OM::_0F, 0x74),
2126                AvxOpcode::Vpcmpeqw => (LP::_66, OM::_0F, 0x75),
2127                AvxOpcode::Vpcmpeqd => (LP::_66, OM::_0F, 0x76),
2128                AvxOpcode::Vpcmpeqq => (LP::_66, OM::_0F38, 0x29),
2129                AvxOpcode::Vpcmpgtb => (LP::_66, OM::_0F, 0x64),
2130                AvxOpcode::Vpcmpgtw => (LP::_66, OM::_0F, 0x65),
2131                AvxOpcode::Vpcmpgtd => (LP::_66, OM::_0F, 0x66),
2132                AvxOpcode::Vpcmpgtq => (LP::_66, OM::_0F38, 0x37),
2133                AvxOpcode::Vmovlhps => (LP::None, OM::_0F, 0x16),
2134                AvxOpcode::Vpminsb => (LP::_66, OM::_0F38, 0x38),
2135                AvxOpcode::Vpminsw => (LP::_66, OM::_0F, 0xEA),
2136                AvxOpcode::Vpminsd => (LP::_66, OM::_0F38, 0x39),
2137                AvxOpcode::Vpmaxsb => (LP::_66, OM::_0F38, 0x3C),
2138                AvxOpcode::Vpmaxsw => (LP::_66, OM::_0F, 0xEE),
2139                AvxOpcode::Vpmaxsd => (LP::_66, OM::_0F38, 0x3D),
2140                AvxOpcode::Vpminub => (LP::_66, OM::_0F, 0xDA),
2141                AvxOpcode::Vpminuw => (LP::_66, OM::_0F38, 0x3A),
2142                AvxOpcode::Vpminud => (LP::_66, OM::_0F38, 0x3B),
2143                AvxOpcode::Vpmaxub => (LP::_66, OM::_0F, 0xDE),
2144                AvxOpcode::Vpmaxuw => (LP::_66, OM::_0F38, 0x3E),
2145                AvxOpcode::Vpmaxud => (LP::_66, OM::_0F38, 0x3F),
2146                AvxOpcode::Vpunpcklbw => (LP::_66, OM::_0F, 0x60),
2147                AvxOpcode::Vpunpckhbw => (LP::_66, OM::_0F, 0x68),
2148                AvxOpcode::Vpacksswb => (LP::_66, OM::_0F, 0x63),
2149                AvxOpcode::Vpackssdw => (LP::_66, OM::_0F, 0x6B),
2150                AvxOpcode::Vpackuswb => (LP::_66, OM::_0F, 0x67),
2151                AvxOpcode::Vpackusdw => (LP::_66, OM::_0F38, 0x2B),
2152                AvxOpcode::Vpmaddwd => (LP::_66, OM::_0F, 0xF5),
2153                AvxOpcode::Vpmaddubsw => (LP::_66, OM::_0F38, 0x04),
2154                AvxOpcode::Vpshufb => (LP::_66, OM::_0F38, 0x00),
2155                AvxOpcode::Vpsllw => (LP::_66, OM::_0F, 0xF1),
2156                AvxOpcode::Vpslld => (LP::_66, OM::_0F, 0xF2),
2157                AvxOpcode::Vpsllq => (LP::_66, OM::_0F, 0xF3),
2158                AvxOpcode::Vpsraw => (LP::_66, OM::_0F, 0xE1),
2159                AvxOpcode::Vpsrad => (LP::_66, OM::_0F, 0xE2),
2160                AvxOpcode::Vaddss => (LP::_F3, OM::_0F, 0x58),
2161                AvxOpcode::Vaddsd => (LP::_F2, OM::_0F, 0x58),
2162                AvxOpcode::Vmulss => (LP::_F3, OM::_0F, 0x59),
2163                AvxOpcode::Vmulsd => (LP::_F2, OM::_0F, 0x59),
2164                AvxOpcode::Vsubss => (LP::_F3, OM::_0F, 0x5C),
2165                AvxOpcode::Vsubsd => (LP::_F2, OM::_0F, 0x5C),
2166                AvxOpcode::Vdivss => (LP::_F3, OM::_0F, 0x5E),
2167                AvxOpcode::Vdivsd => (LP::_F2, OM::_0F, 0x5E),
2168                AvxOpcode::Vminss => (LP::_F3, OM::_0F, 0x5D),
2169                AvxOpcode::Vminsd => (LP::_F2, OM::_0F, 0x5D),
2170                AvxOpcode::Vmaxss => (LP::_F3, OM::_0F, 0x5F),
2171                AvxOpcode::Vmaxsd => (LP::_F2, OM::_0F, 0x5F),
2172                AvxOpcode::Vphaddw => (LP::_66, OM::_0F38, 0x01),
2173                AvxOpcode::Vphaddd => (LP::_66, OM::_0F38, 0x02),
2174                AvxOpcode::Vpunpckldq => (LP::_66, OM::_0F, 0x62),
2175                AvxOpcode::Vpunpckhdq => (LP::_66, OM::_0F, 0x6A),
2176                AvxOpcode::Vpunpcklqdq => (LP::_66, OM::_0F, 0x6C),
2177                AvxOpcode::Vpunpckhqdq => (LP::_66, OM::_0F, 0x6D),
2178                AvxOpcode::Vmovsd => (LP::_F2, OM::_0F, 0x10),
2179                _ => panic!("unexpected rmir vex opcode {op:?}"),
2180            };
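            // These are the three-operand VEX forms (illustrative): e.g. for
            // `vpaddd %xmm3, %xmm2, %xmm1` the destination goes in ModRM.reg, src1 in
            // vvvv, and src2 in ModRM.rm, matching the builder calls below.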
2181            VexInstruction::new()
2182                .length(VexVectorLength::V128)
2183                .prefix(prefix)
2184                .map(map)
2185                .opcode(opcode)
2186                .reg(dst.to_real_reg().unwrap().hw_enc())
2187                .vvvv(src1.to_real_reg().unwrap().hw_enc())
2188                .rm(src2)
2189                .encode(sink);
2190        }
2191
2192        Inst::XmmRmRImmVex {
2193            op,
2194            src1,
2195            src2,
2196            dst,
2197            imm,
2198        } => {
2199            let dst = allocs.next(dst.to_reg().to_reg());
2200            let src1 = allocs.next(src1.to_reg());
2201            let src2 = match src2.clone().to_reg_mem().with_allocs(allocs) {
2202                RegMem::Reg { reg } => {
2203                    RegisterOrAmode::Register(reg.to_real_reg().unwrap().hw_enc().into())
2204                }
2205                RegMem::Mem { addr } => RegisterOrAmode::Amode(addr.finalize(state, sink)),
2206            };
2207
2208            let (w, prefix, map, opcode) = match op {
2209                AvxOpcode::Vcmpps => (false, LegacyPrefixes::None, OpcodeMap::_0F, 0xC2),
2210                AvxOpcode::Vcmppd => (false, LegacyPrefixes::_66, OpcodeMap::_0F, 0xC2),
2211                AvxOpcode::Vpalignr => (false, LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x0F),
2212                AvxOpcode::Vinsertps => (false, LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x21),
2213                AvxOpcode::Vshufps => (false, LegacyPrefixes::None, OpcodeMap::_0F, 0xC6),
2214                AvxOpcode::Vpblendw => (false, LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x0E),
2215                _ => panic!("unexpected rmr_imm_vex opcode {op:?}"),
2216            };
2217
2218            VexInstruction::new()
2219                .length(VexVectorLength::V128)
2220                .prefix(prefix)
2221                .map(map)
2222                .w(w)
2223                .opcode(opcode)
2224                .reg(dst.to_real_reg().unwrap().hw_enc())
2225                .vvvv(src1.to_real_reg().unwrap().hw_enc())
2226                .rm(src2)
2227                .imm(*imm)
2228                .encode(sink);
2229        }
2230
2231        Inst::XmmVexPinsr {
2232            op,
2233            src1,
2234            src2,
2235            dst,
2236            imm,
2237        } => {
2238            let dst = allocs.next(dst.to_reg().to_reg());
2239            let src1 = allocs.next(src1.to_reg());
2240            let src2 = match src2.clone().to_reg_mem().with_allocs(allocs) {
2241                RegMem::Reg { reg } => {
2242                    RegisterOrAmode::Register(reg.to_real_reg().unwrap().hw_enc().into())
2243                }
2244                RegMem::Mem { addr } => RegisterOrAmode::Amode(addr.finalize(state, sink)),
2245            };
2246
2247            let (w, map, opcode) = match op {
2248                AvxOpcode::Vpinsrb => (false, OpcodeMap::_0F3A, 0x20),
2249                AvxOpcode::Vpinsrw => (false, OpcodeMap::_0F, 0xC4),
2250                AvxOpcode::Vpinsrd => (false, OpcodeMap::_0F3A, 0x22),
2251                AvxOpcode::Vpinsrq => (true, OpcodeMap::_0F3A, 0x22),
2252                _ => panic!("unexpected vex_pinsr opcode {op:?}"),
2253            };
2254
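            // Note (illustrative): `vpinsrw` lives in the 0F map (opcode C4) while the
            // byte/dword/qword variants use 0F3A; only `vpinsrq` sets VEX.W to select a
            // 64-bit GPR source.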
2255            VexInstruction::new()
2256                .length(VexVectorLength::V128)
2257                .prefix(LegacyPrefixes::_66)
2258                .map(map)
2259                .w(w)
2260                .opcode(opcode)
2261                .reg(dst.to_real_reg().unwrap().hw_enc())
2262                .vvvv(src1.to_real_reg().unwrap().hw_enc())
2263                .rm(src2)
2264                .imm(*imm)
2265                .encode(sink);
2266        }
2267
2268        Inst::XmmRmRVex3 {
2269            op,
2270            src1,
2271            src2,
2272            src3,
2273            dst,
2274        } => {
2275            let src1 = allocs.next(src1.to_reg());
2276            let dst = allocs.next(dst.to_reg().to_reg());
2277            debug_assert_eq!(src1, dst);
2278            let src2 = allocs.next(src2.to_reg());
2279            let src3 = match src3.clone().to_reg_mem().with_allocs(allocs) {
2280                RegMem::Reg { reg } => {
2281                    RegisterOrAmode::Register(reg.to_real_reg().unwrap().hw_enc().into())
2282                }
2283                RegMem::Mem { addr } => RegisterOrAmode::Amode(addr.finalize(state, sink)),
2284            };
2285
2286            let (w, map, opcode) = match op {
2287                AvxOpcode::Vfmadd132ss => (false, OpcodeMap::_0F38, 0x99),
2288                AvxOpcode::Vfmadd213ss => (false, OpcodeMap::_0F38, 0xA9),
2289                AvxOpcode::Vfnmadd132ss => (false, OpcodeMap::_0F38, 0x9D),
2290                AvxOpcode::Vfnmadd213ss => (false, OpcodeMap::_0F38, 0xAD),
2291                AvxOpcode::Vfmadd132sd => (true, OpcodeMap::_0F38, 0x99),
2292                AvxOpcode::Vfmadd213sd => (true, OpcodeMap::_0F38, 0xA9),
2293                AvxOpcode::Vfnmadd132sd => (true, OpcodeMap::_0F38, 0x9D),
2294                AvxOpcode::Vfnmadd213sd => (true, OpcodeMap::_0F38, 0xAD),
2295                AvxOpcode::Vfmadd132ps => (false, OpcodeMap::_0F38, 0x98),
2296                AvxOpcode::Vfmadd213ps => (false, OpcodeMap::_0F38, 0xA8),
2297                AvxOpcode::Vfnmadd132ps => (false, OpcodeMap::_0F38, 0x9C),
2298                AvxOpcode::Vfnmadd213ps => (false, OpcodeMap::_0F38, 0xAC),
2299                AvxOpcode::Vfmadd132pd => (true, OpcodeMap::_0F38, 0x98),
2300                AvxOpcode::Vfmadd213pd => (true, OpcodeMap::_0F38, 0xA8),
2301                AvxOpcode::Vfnmadd132pd => (true, OpcodeMap::_0F38, 0x9C),
2302                AvxOpcode::Vfnmadd213pd => (true, OpcodeMap::_0F38, 0xAC),
2303                AvxOpcode::Vblendvps => (false, OpcodeMap::_0F3A, 0x4A),
2304                AvxOpcode::Vblendvpd => (false, OpcodeMap::_0F3A, 0x4B),
2305                AvxOpcode::Vpblendvb => (false, OpcodeMap::_0F3A, 0x4C),
2306                _ => unreachable!(),
2307            };
2308
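            // Operand placement (illustrative): with dst doubling as src1, the 213
            // forms compute dst = src2 * dst + src3, while the 132 forms compute
            // dst = dst * src3 + src2; src2 travels in vvvv and src3 in ModRM.rm.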
2309            VexInstruction::new()
2310                .length(VexVectorLength::V128)
2311                .prefix(LegacyPrefixes::_66)
2312                .map(map)
2313                .w(w)
2314                .opcode(opcode)
2315                .reg(dst.to_real_reg().unwrap().hw_enc())
2316                .rm(src3)
2317                .vvvv(src2.to_real_reg().unwrap().hw_enc())
2318                .encode(sink);
2319        }
2320
2321        Inst::XmmRmRBlendVex {
2322            op,
2323            src1,
2324            src2,
2325            mask,
2326            dst,
2327        } => {
2328            let dst = allocs.next(dst.to_reg().to_reg());
2329            let src1 = allocs.next(src1.to_reg());
2330            let src2 = match src2.clone().to_reg_mem().with_allocs(allocs) {
2331                RegMem::Reg { reg } => {
2332                    RegisterOrAmode::Register(reg.to_real_reg().unwrap().hw_enc().into())
2333                }
2334                RegMem::Mem { addr } => RegisterOrAmode::Amode(addr.finalize(state, sink)),
2335            };
2336            let mask = allocs.next(mask.to_reg());
2337
2338            let opcode = match op {
2339                AvxOpcode::Vblendvps => 0x4A,
2340                AvxOpcode::Vblendvpd => 0x4B,
2341                AvxOpcode::Vpblendvb => 0x4C,
2342                _ => unreachable!(),
2343            };
2344
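            // The mask register is passed through the /is4 immediate (illustrative):
            // its hardware encoding occupies bits 7:4 of the trailing imm8, which is
            // what the shift by 4 below produces.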
2345            VexInstruction::new()
2346                .length(VexVectorLength::V128)
2347                .prefix(LegacyPrefixes::_66)
2348                .map(OpcodeMap::_0F3A)
2349                .opcode(opcode)
2350                .reg(dst.to_real_reg().unwrap().hw_enc())
2351                .vvvv(src1.to_real_reg().unwrap().hw_enc())
2352                .rm(src2)
2353                .imm(mask.to_real_reg().unwrap().hw_enc() << 4)
2354                .encode(sink);
2355        }
2356
2357        Inst::XmmUnaryRmRVex { op, src, dst } => {
2358            let dst = allocs.next(dst.to_reg().to_reg());
2359            let src = match src.clone().to_reg_mem().with_allocs(allocs) {
2360                RegMem::Reg { reg } => {
2361                    RegisterOrAmode::Register(reg.to_real_reg().unwrap().hw_enc().into())
2362                }
2363                RegMem::Mem { addr } => RegisterOrAmode::Amode(addr.finalize(state, sink)),
2364            };
2365
2366            let (prefix, map, opcode) = match op {
2367                AvxOpcode::Vpmovsxbw => (LegacyPrefixes::_66, OpcodeMap::_0F38, 0x20),
2368                AvxOpcode::Vpmovzxbw => (LegacyPrefixes::_66, OpcodeMap::_0F38, 0x30),
2369                AvxOpcode::Vpmovsxwd => (LegacyPrefixes::_66, OpcodeMap::_0F38, 0x23),
2370                AvxOpcode::Vpmovzxwd => (LegacyPrefixes::_66, OpcodeMap::_0F38, 0x33),
2371                AvxOpcode::Vpmovsxdq => (LegacyPrefixes::_66, OpcodeMap::_0F38, 0x25),
2372                AvxOpcode::Vpmovzxdq => (LegacyPrefixes::_66, OpcodeMap::_0F38, 0x35),
2373                AvxOpcode::Vpabsb => (LegacyPrefixes::_66, OpcodeMap::_0F38, 0x1C),
2374                AvxOpcode::Vpabsw => (LegacyPrefixes::_66, OpcodeMap::_0F38, 0x1D),
2375                AvxOpcode::Vpabsd => (LegacyPrefixes::_66, OpcodeMap::_0F38, 0x1E),
2376                AvxOpcode::Vsqrtps => (LegacyPrefixes::None, OpcodeMap::_0F, 0x51),
2377                AvxOpcode::Vsqrtpd => (LegacyPrefixes::_66, OpcodeMap::_0F, 0x51),
2378                AvxOpcode::Vcvtdq2pd => (LegacyPrefixes::_F3, OpcodeMap::_0F, 0xE6),
2379                AvxOpcode::Vcvtdq2ps => (LegacyPrefixes::None, OpcodeMap::_0F, 0x5B),
2380                AvxOpcode::Vcvtpd2ps => (LegacyPrefixes::_66, OpcodeMap::_0F, 0x5A),
2381                AvxOpcode::Vcvtps2pd => (LegacyPrefixes::None, OpcodeMap::_0F, 0x5A),
2382                AvxOpcode::Vcvttpd2dq => (LegacyPrefixes::_66, OpcodeMap::_0F, 0xE6),
2383                AvxOpcode::Vcvttps2dq => (LegacyPrefixes::_F3, OpcodeMap::_0F, 0x5B),
2384                AvxOpcode::Vmovdqu => (LegacyPrefixes::_F3, OpcodeMap::_0F, 0x6F),
2385                AvxOpcode::Vmovups => (LegacyPrefixes::None, OpcodeMap::_0F, 0x10),
2386                AvxOpcode::Vmovupd => (LegacyPrefixes::_66, OpcodeMap::_0F, 0x10),
2387
2388                // Note that for `vmovss`/`vmovsd` the `inst.isle` rules should
2389                // statically ensure that only `Amode` operands are used here.
2390                // The register-to-register forms of these instructions take a
2391                // second source operand (merging the upper bits), which this
2392                // unary encoding cannot express.
2393                AvxOpcode::Vmovss => match &src {
2394                    RegisterOrAmode::Amode(_) => (LegacyPrefixes::_F3, OpcodeMap::_0F, 0x10),
2395                    _ => unreachable!(),
2396                },
2397                AvxOpcode::Vmovsd => match &src {
2398                    RegisterOrAmode::Amode(_) => (LegacyPrefixes::_F2, OpcodeMap::_0F, 0x10),
2399                    _ => unreachable!(),
2400                },
2401
2402                AvxOpcode::Vpbroadcastb => (LegacyPrefixes::_66, OpcodeMap::_0F38, 0x78),
2403                AvxOpcode::Vpbroadcastw => (LegacyPrefixes::_66, OpcodeMap::_0F38, 0x79),
2404                AvxOpcode::Vpbroadcastd => (LegacyPrefixes::_66, OpcodeMap::_0F38, 0x58),
2405                AvxOpcode::Vbroadcastss => (LegacyPrefixes::_66, OpcodeMap::_0F38, 0x18),
2406                AvxOpcode::Vmovddup => (LegacyPrefixes::_F2, OpcodeMap::_0F, 0x12),
2407
2408                AvxOpcode::Vcvtss2sd => (LegacyPrefixes::_F3, OpcodeMap::_0F, 0x5A),
2409                AvxOpcode::Vcvtsd2ss => (LegacyPrefixes::_F2, OpcodeMap::_0F, 0x5A),
2410                AvxOpcode::Vsqrtss => (LegacyPrefixes::_F3, OpcodeMap::_0F, 0x51),
2411                AvxOpcode::Vsqrtsd => (LegacyPrefixes::_F2, OpcodeMap::_0F, 0x51),
2412
2413                _ => panic!("unexpected unary rmr vex opcode {op:?}"),
2414            };
2415
2416            let vex = VexInstruction::new()
2417                .length(VexVectorLength::V128)
2418                .prefix(prefix)
2419                .map(map)
2420                .opcode(opcode)
2421                .reg(dst.to_real_reg().unwrap().hw_enc())
2422                .rm(src);
2423
2424            // These opcodes take a second operand through `vvvv` which copies
2425            // the upper bits into the destination register. That's not
2426            // reflected in the CLIF instruction, however, since the SSE version
2427            // doesn't have this functionality. Instead just copy whatever
2428            // happens to already be in the destination, which at least is what
2429            // LLVM seems to do.
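            // (For example, `vsqrtss %xmm2, %xmm1, %xmm0` writes
            // sqrt(xmm2[31:0]) into xmm0[31:0] and copies xmm1[127:32], the
            // `vvvv` operand, into xmm0[127:32].)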
2430            let vex = match op {
2431                AvxOpcode::Vcvtss2sd
2432                | AvxOpcode::Vcvtsd2ss
2433                | AvxOpcode::Vsqrtss
2434                | AvxOpcode::Vsqrtsd => vex.vvvv(dst.to_real_reg().unwrap().hw_enc()),
2435                _ => vex,
2436            };
2437            vex.encode(sink);
2438        }
2439
2440        Inst::XmmUnaryRmRImmVex { op, src, dst, imm } => {
2441            let dst = allocs.next(dst.to_reg().to_reg());
2442            let src = match src.clone().to_reg_mem().with_allocs(allocs) {
2443                RegMem::Reg { reg } => {
2444                    RegisterOrAmode::Register(reg.to_real_reg().unwrap().hw_enc().into())
2445                }
2446                RegMem::Mem { addr } => RegisterOrAmode::Amode(addr.finalize(state, sink)),
2447            };
2448
2449            let (prefix, map, opcode) = match op {
2450                AvxOpcode::Vroundps => (LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x08),
2451                AvxOpcode::Vroundpd => (LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x09),
2452                AvxOpcode::Vpshuflw => (LegacyPrefixes::_F2, OpcodeMap::_0F, 0x70),
2453                AvxOpcode::Vpshufhw => (LegacyPrefixes::_F3, OpcodeMap::_0F, 0x70),
2454                AvxOpcode::Vpshufd => (LegacyPrefixes::_66, OpcodeMap::_0F, 0x70),
2455                AvxOpcode::Vroundss => (LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x0A),
2456                AvxOpcode::Vroundsd => (LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x0B),
2457                _ => panic!("unexpected rmr_imm_vex opcode {op:?}"),
2458            };
2459
2460            let vex = VexInstruction::new()
2461                .length(VexVectorLength::V128)
2462                .prefix(prefix)
2463                .map(map)
2464                .opcode(opcode)
2465                .reg(dst.to_real_reg().unwrap().hw_enc())
2466                .rm(src)
2467                .imm(*imm);
2468
2469            // See comments in similar block above in `XmmUnaryRmRVex` for what
2470            // this is doing.
2471            let vex = match op {
2472                AvxOpcode::Vroundss | AvxOpcode::Vroundsd => {
2473                    vex.vvvv(dst.to_real_reg().unwrap().hw_enc())
2474                }
2475                _ => vex,
2476            };
2477            vex.encode(sink);
2478        }
2479
2480        Inst::XmmMovRMVex { op, src, dst } => {
2481            let src = allocs.next(src.to_reg());
2482            let dst = dst.with_allocs(allocs).finalize(state, sink);
2483
2484            let (prefix, map, opcode) = match op {
2485                AvxOpcode::Vmovdqu => (LegacyPrefixes::_F3, OpcodeMap::_0F, 0x7F),
2486                AvxOpcode::Vmovss => (LegacyPrefixes::_F3, OpcodeMap::_0F, 0x11),
2487                AvxOpcode::Vmovsd => (LegacyPrefixes::_F2, OpcodeMap::_0F, 0x11),
2488                AvxOpcode::Vmovups => (LegacyPrefixes::None, OpcodeMap::_0F, 0x11),
2489                AvxOpcode::Vmovupd => (LegacyPrefixes::_66, OpcodeMap::_0F, 0x11),
2490                _ => unimplemented!("Opcode {:?} not implemented", op),
2491            };
2492            VexInstruction::new()
2493                .length(VexVectorLength::V128)
2494                .prefix(prefix)
2495                .map(map)
2496                .opcode(opcode)
2497                .rm(dst)
2498                .reg(src.to_real_reg().unwrap().hw_enc())
2499                .encode(sink);
2500        }
2501
2502        Inst::XmmMovRMImmVex { op, src, dst, imm } => {
2503            let src = allocs.next(src.to_reg());
2504            let dst = dst.with_allocs(allocs).finalize(state, sink);
2505
2506            let (w, prefix, map, opcode) = match op {
2507                AvxOpcode::Vpextrb => (false, LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x14),
2508                AvxOpcode::Vpextrw => (false, LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x15),
2509                AvxOpcode::Vpextrd => (false, LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x16),
2510                AvxOpcode::Vpextrq => (true, LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x16),
2511                _ => unimplemented!("Opcode {:?} not implemented", op),
2512            };
2513            VexInstruction::new()
2514                .length(VexVectorLength::V128)
2515                .w(w)
2516                .prefix(prefix)
2517                .map(map)
2518                .opcode(opcode)
2519                .rm(dst)
2520                .reg(src.to_real_reg().unwrap().hw_enc())
2521                .imm(*imm)
2522                .encode(sink);
2523        }
2524
2525        Inst::XmmToGprImmVex { op, src, dst, imm } => {
2526            let src = allocs.next(src.to_reg());
2527            let dst = allocs.next(dst.to_reg().to_reg());
2528
2529            let (w, prefix, map, opcode) = match op {
2530                AvxOpcode::Vpextrb => (false, LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x14),
2531                AvxOpcode::Vpextrw => (false, LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x15),
2532                AvxOpcode::Vpextrd => (false, LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x16),
2533                AvxOpcode::Vpextrq => (true, LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x16),
2534                _ => unimplemented!("Opcode {:?} not implemented", op),
2535            };
2536            VexInstruction::new()
2537                .length(VexVectorLength::V128)
2538                .w(w)
2539                .prefix(prefix)
2540                .map(map)
2541                .opcode(opcode)
2542                .rm(dst.to_real_reg().unwrap().hw_enc())
2543                .reg(src.to_real_reg().unwrap().hw_enc())
2544                .imm(*imm)
2545                .encode(sink);
2546        }
2547
2548        Inst::XmmToGprVex {
2549            op,
2550            src,
2551            dst,
2552            dst_size,
2553        } => {
2554            let src = allocs.next(src.to_reg());
2555            let dst = allocs.next(dst.to_reg().to_reg());
2556
2557            let (prefix, map, opcode) = match op {
2558                // vmovd/vmovq are differentiated by `w`
2559                AvxOpcode::Vmovd | AvxOpcode::Vmovq => (LegacyPrefixes::_66, OpcodeMap::_0F, 0x7E),
2560                AvxOpcode::Vmovmskps => (LegacyPrefixes::None, OpcodeMap::_0F, 0x50),
2561                AvxOpcode::Vmovmskpd => (LegacyPrefixes::_66, OpcodeMap::_0F, 0x50),
2562                AvxOpcode::Vpmovmskb => (LegacyPrefixes::_66, OpcodeMap::_0F, 0xD7),
2563                _ => unimplemented!("Opcode {:?} not implemented", op),
2564            };
2565            let w = match dst_size {
2566                OperandSize::Size64 => true,
2567                _ => false,
2568            };
2569            let mut vex = VexInstruction::new()
2570                .length(VexVectorLength::V128)
2571                .w(w)
2572                .prefix(prefix)
2573                .map(map)
2574                .opcode(opcode);
2575            vex = match op {
2576                // `vmovq`/`vmovd` reverse the order of destination and source
2577                // relative to the other opcodes using this shape of instruction.
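                // (For instance, `vmovd %xmm1, %eax` places the GPR destination
                // in ModRM.r/m and the XMM source in ModRM.reg, whereas
                // `vmovmskps %xmm1, %eax` uses ModRM.reg for the GPR destination
                // and ModRM.r/m for the XMM source.)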
2578                AvxOpcode::Vmovd | AvxOpcode::Vmovq => vex
2579                    .rm(dst.to_real_reg().unwrap().hw_enc())
2580                    .reg(src.to_real_reg().unwrap().hw_enc()),
2581                _ => vex
2582                    .rm(src.to_real_reg().unwrap().hw_enc())
2583                    .reg(dst.to_real_reg().unwrap().hw_enc()),
2584            };
2585            vex.encode(sink);
2586        }
2587
2588        Inst::GprToXmmVex {
2589            op,
2590            src,
2591            dst,
2592            src_size,
2593        } => {
2594            let dst = allocs.next(dst.to_reg().to_reg());
2595            let src = match src.clone().to_reg_mem().with_allocs(allocs) {
2596                RegMem::Reg { reg } => {
2597                    RegisterOrAmode::Register(reg.to_real_reg().unwrap().hw_enc().into())
2598                }
2599                RegMem::Mem { addr } => RegisterOrAmode::Amode(addr.finalize(state, sink)),
2600            };
2601
2602            let (prefix, map, opcode) = match op {
2603                // vmovd/vmovq are differentiated by `w`
2604                AvxOpcode::Vmovd | AvxOpcode::Vmovq => (LegacyPrefixes::_66, OpcodeMap::_0F, 0x6E),
2605                AvxOpcode::Vcvtsi2ss => (LegacyPrefixes::_F3, OpcodeMap::_0F, 0x2A),
2606                AvxOpcode::Vcvtsi2sd => (LegacyPrefixes::_F2, OpcodeMap::_0F, 0x2A),
2607                _ => unimplemented!("Opcode {:?} not implemented", op),
2608            };
2609            let w = match src_size {
2610                OperandSize::Size64 => true,
2611                _ => false,
2612            };
2613            let mut insn = VexInstruction::new()
2614                .length(VexVectorLength::V128)
2615                .w(w)
2616                .prefix(prefix)
2617                .map(map)
2618                .opcode(opcode)
2619                .rm(src)
2620                .reg(dst.to_real_reg().unwrap().hw_enc());
2621            // These opcodes technically take a second operand which is the
2622            // upper bits to preserve during the float conversion. We don't
2623            // actually use this in this backend right now so reuse the
2624            // destination register. This at least matches what LLVM does.
2625            if let AvxOpcode::Vcvtsi2ss | AvxOpcode::Vcvtsi2sd = op {
2626                insn = insn.vvvv(dst.to_real_reg().unwrap().hw_enc());
2627            }
2628            insn.encode(sink);
2629        }
2630
2631        Inst::XmmRmREvex {
2632            op,
2633            src1,
2634            src2,
2635            dst,
2636        }
2637        | Inst::XmmRmREvex3 {
2638            op,
2639            src1,
2640            src2,
2641            dst,
2642            // `dst` reuses `src3`.
2643            ..
2644        } => {
2645            let dst = allocs.next(dst.to_reg().to_reg());
2646            let src2 = allocs.next(src2.to_reg());
2647            if let Inst::XmmRmREvex3 { src3, .. } = inst {
2648                let src3 = allocs.next(src3.to_reg());
2649                debug_assert_eq!(src3, dst);
2650            }
2651            let src1 = src1.clone().to_reg_mem().with_allocs(allocs);
2652
2653            let (w, opcode) = match op {
2654                Avx512Opcode::Vpermi2b => (false, 0x75),
2655                Avx512Opcode::Vpmullq => (true, 0x40),
2656                _ => unimplemented!("Opcode {:?} not implemented", op),
2657            };
2658            match src1 {
2659                RegMem::Reg { reg: src } => EvexInstruction::new()
2660                    .length(EvexVectorLength::V128)
2661                    .prefix(LegacyPrefixes::_66)
2662                    .map(OpcodeMap::_0F38)
2663                    .w(w)
2664                    .opcode(opcode)
2665                    .reg(dst.to_real_reg().unwrap().hw_enc())
2666                    .rm(src.to_real_reg().unwrap().hw_enc())
2667                    .vvvvv(src2.to_real_reg().unwrap().hw_enc())
2668                    .encode(sink),
2669                _ => todo!(),
2670            };
2671        }
2672
2673        Inst::XmmMinMaxSeq {
2674            size,
2675            is_min,
2676            lhs,
2677            rhs,
2678            dst,
2679        } => {
2680            let rhs = allocs.next(rhs.to_reg());
2681            let lhs = allocs.next(lhs.to_reg());
2682            let dst = allocs.next(dst.to_reg().to_reg());
2683            debug_assert_eq!(rhs, dst);
2684
2685            // Generates the following sequence:
2686            // ucomiss/ucomisd %lhs, %rhs_dst
2687            // jnz do_min_max
2688            // jp propagate_nan
2689            //
2690            // ;; ordered and equal: propagate the sign bit (for -0 vs 0):
2691            // {and,or}p{s,d} %lhs, %rhs_dst
2692            // j done
2693            //
2694            // ;; to get the desired NaN behavior (a signalling NaN is transformed into a quiet
2695            // ;; NaN and the NaN value is returned), we add both inputs.
2696            // propagate_nan:
2697            // add{ss,sd} %lhs, %rhs_dst
2698            // j done
2699            //
2700            // do_min_max:
2701            // {min,max}{ss,sd} %lhs, %rhs_dst
2702            //
2703            // done:
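            // (The branches are needed because the hardware min/max do not
            // implement CLIF semantics directly: `minss`/`maxss` return the
            // second (source) operand whenever either input is a NaN or when
            // both inputs are zeros of differing sign, whereas CLIF requires
            // NaN propagation and treats -0.0 as less than +0.0.)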
2704            let done = sink.get_label();
2705            let propagate_nan = sink.get_label();
2706            let do_min_max = sink.get_label();
2707
2708            let (add_op, cmp_op, and_op, or_op, min_max_op) = match size {
2709                OperandSize::Size32 => (
2710                    SseOpcode::Addss,
2711                    SseOpcode::Ucomiss,
2712                    SseOpcode::Andps,
2713                    SseOpcode::Orps,
2714                    if *is_min {
2715                        SseOpcode::Minss
2716                    } else {
2717                        SseOpcode::Maxss
2718                    },
2719                ),
2720                OperandSize::Size64 => (
2721                    SseOpcode::Addsd,
2722                    SseOpcode::Ucomisd,
2723                    SseOpcode::Andpd,
2724                    SseOpcode::Orpd,
2725                    if *is_min {
2726                        SseOpcode::Minsd
2727                    } else {
2728                        SseOpcode::Maxsd
2729                    },
2730                ),
2731                _ => unreachable!(),
2732            };
2733
2734            let inst = Inst::xmm_cmp_rm_r(cmp_op, RegMem::reg(lhs), dst);
2735            inst.emit(&[], sink, info, state);
2736
2737            one_way_jmp(sink, CC::NZ, do_min_max);
2738            one_way_jmp(sink, CC::P, propagate_nan);
2739
2740            // Ordered and equal. The operands are bit-identical unless they are zero
2741            // and negative zero. These instructions merge the sign bits in that
2742            // case, and are no-ops otherwise.
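            // (`or` makes -0.0 win for min, since OR-ing the sign bits yields a
            // set sign bit; `and` makes +0.0 win for max, since AND-ing them
            // yields a clear sign bit.)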
2743            let op = if *is_min { or_op } else { and_op };
2744            let inst = Inst::xmm_rm_r(op, RegMem::reg(lhs), Writable::from_reg(dst));
2745            inst.emit(&[], sink, info, state);
2746
2747            let inst = Inst::jmp_known(done);
2748            inst.emit(&[], sink, info, state);
2749
2750            // x86's min/max are not symmetric; if either operand is a NaN, they return the
2751            // read-only operand: perform an addition between the two operands, which has the
2752            // desired NaN propagation effects.
2753            sink.bind_label(propagate_nan);
2754            let inst = Inst::xmm_rm_r(add_op, RegMem::reg(lhs), Writable::from_reg(dst));
2755            inst.emit(&[], sink, info, state);
2756
2757            one_way_jmp(sink, CC::P, done);
2758
2759            sink.bind_label(do_min_max);
2760
2761            let inst = Inst::xmm_rm_r(min_max_op, RegMem::reg(lhs), Writable::from_reg(dst));
2762            inst.emit(&[], sink, info, state);
2763
2764            sink.bind_label(done);
2765        }
2766
2767        Inst::XmmRmRImm {
2768            op,
2769            src1,
2770            src2,
2771            dst,
2772            imm,
2773            size,
2774        } => {
2775            let src1 = allocs.next(*src1);
2776            let dst = allocs.next(dst.to_reg());
2777            let src2 = src2.with_allocs(allocs);
2778            debug_assert_eq!(src1, dst);
2779
2780            let (prefix, opcode, len) = match op {
2781                SseOpcode::Cmpps => (LegacyPrefixes::None, 0x0FC2, 2),
2782                SseOpcode::Cmppd => (LegacyPrefixes::_66, 0x0FC2, 2),
2783                SseOpcode::Cmpss => (LegacyPrefixes::_F3, 0x0FC2, 2),
2784                SseOpcode::Cmpsd => (LegacyPrefixes::_F2, 0x0FC2, 2),
2785                SseOpcode::Insertps => (LegacyPrefixes::_66, 0x0F3A21, 3),
2786                SseOpcode::Palignr => (LegacyPrefixes::_66, 0x0F3A0F, 3),
2787                SseOpcode::Pinsrb => (LegacyPrefixes::_66, 0x0F3A20, 3),
2788                SseOpcode::Pinsrw => (LegacyPrefixes::_66, 0x0FC4, 2),
2789                SseOpcode::Pinsrd => (LegacyPrefixes::_66, 0x0F3A22, 3),
2790                SseOpcode::Shufps => (LegacyPrefixes::None, 0x0FC6, 2),
2791                SseOpcode::Pblendw => (LegacyPrefixes::_66, 0x0F3A0E, 3),
2792                _ => unimplemented!("Opcode {:?} not implemented", op),
2793            };
2794            let rex = RexFlags::from(*size);
2795            let regs_swapped = match *op {
2796                // These opcodes (and not the SSE2 version of PEXTRW) flip the operand
2797                // encoding: `dst` in ModRM's r/m, `src` in ModRM's reg field.
2798                SseOpcode::Pextrb | SseOpcode::Pextrd => true,
2799                // The rest of the opcodes have the customary encoding: `dst` in ModRM's reg,
2800                // `src` in ModRM's r/m field.
2801                _ => false,
2802            };
2803            match src2 {
2804                RegMem::Reg { reg } => {
2805                    if regs_swapped {
2806                        emit_std_reg_reg(sink, prefix, opcode, len, reg, dst, rex);
2807                    } else {
2808                        emit_std_reg_reg(sink, prefix, opcode, len, dst, reg, rex);
2809                    }
2810                }
2811                RegMem::Mem { addr } => {
2812                    let addr = &addr.finalize(state, sink);
2813                    assert!(
2814                        !regs_swapped,
2815                        "No existing way to encode a mem argument in the ModRM r/m field."
2816                    );
2817                    // N.B.: bytes_at_end == 1, because of the `imm` byte below.
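                    // (The trailing byte count lets RIP-relative displacements
                    // be computed from the end of the full instruction,
                    // immediate included.)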
2818                    emit_std_reg_mem(sink, prefix, opcode, len, dst, addr, rex, 1);
2819                }
2820            }
2821            sink.put1(*imm);
2822        }
2823
2824        Inst::XmmUninitializedValue { .. } => {
2825            // This instruction format only exists to declare a register as a `def`; no code is
2826            // emitted.
2827        }
2828
2829        Inst::XmmMovRM { op, src, dst } => {
2830            let src = allocs.next(src.to_reg());
2831            let dst = dst.with_allocs(allocs);
2832
2833            let (prefix, opcode) = match op {
2834                SseOpcode::Movaps => (LegacyPrefixes::None, 0x0F29),
2835                SseOpcode::Movapd => (LegacyPrefixes::_66, 0x0F29),
2836                SseOpcode::Movdqu => (LegacyPrefixes::_F3, 0x0F7F),
2837                SseOpcode::Movss => (LegacyPrefixes::_F3, 0x0F11),
2838                SseOpcode::Movsd => (LegacyPrefixes::_F2, 0x0F11),
2839                SseOpcode::Movups => (LegacyPrefixes::None, 0x0F11),
2840                SseOpcode::Movupd => (LegacyPrefixes::_66, 0x0F11),
2841                _ => unimplemented!("Opcode {:?} not implemented", op),
2842            };
2843            let dst = &dst.finalize(state, sink);
2844            emit_std_reg_mem(sink, prefix, opcode, 2, src, dst, RexFlags::clear_w(), 0);
2845        }
2846
2847        Inst::XmmMovRMImm { op, src, dst, imm } => {
2848            let src = allocs.next(src.to_reg());
2849            let dst = dst.with_allocs(allocs);
2850
2851            let (w, prefix, opcode) = match op {
2852                SseOpcode::Pextrb => (false, LegacyPrefixes::_66, 0x0F3A14),
2853                SseOpcode::Pextrw => (false, LegacyPrefixes::_66, 0x0F3A15),
2854                SseOpcode::Pextrd => (false, LegacyPrefixes::_66, 0x0F3A16),
2855                SseOpcode::Pextrq => (true, LegacyPrefixes::_66, 0x0F3A16),
2856                _ => unimplemented!("Opcode {:?} not implemented", op),
2857            };
2858            let rex = if w {
2859                RexFlags::set_w()
2860            } else {
2861                RexFlags::clear_w()
2862            };
2863            let dst = &dst.finalize(state, sink);
2864            emit_std_reg_mem(sink, prefix, opcode, 3, src, dst, rex, 1);
2865            sink.put1(*imm);
2866        }
2867
2868        Inst::XmmToGpr {
2869            op,
2870            src,
2871            dst,
2872            dst_size,
2873        } => {
2874            let src = allocs.next(src.to_reg());
2875            let dst = allocs.next(dst.to_reg().to_reg());
2876
2877            let (prefix, opcode, dst_first) = match op {
2878                SseOpcode::Cvttss2si => (LegacyPrefixes::_F3, 0x0F2C, true),
2879                SseOpcode::Cvttsd2si => (LegacyPrefixes::_F2, 0x0F2C, true),
2880                // Movd and movq use the same opcode; the REX.W bit (set below)
2881                // actually determines which is used.
2882                SseOpcode::Movd | SseOpcode::Movq => (LegacyPrefixes::_66, 0x0F7E, false),
2883                SseOpcode::Movmskps => (LegacyPrefixes::None, 0x0F50, true),
2884                SseOpcode::Movmskpd => (LegacyPrefixes::_66, 0x0F50, true),
2885                SseOpcode::Pmovmskb => (LegacyPrefixes::_66, 0x0FD7, true),
2886                _ => panic!("unexpected opcode {:?}", op),
2887            };
2888            let rex = RexFlags::from(*dst_size);
2889            let (src, dst) = if dst_first { (dst, src) } else { (src, dst) };
2890
2891            emit_std_reg_reg(sink, prefix, opcode, 2, src, dst, rex);
2892        }
2893
2894        Inst::XmmToGprImm { op, src, dst, imm } => {
2895            use OperandSize as OS;
2896
2897            let src = allocs.next(src.to_reg());
2898            let dst = allocs.next(dst.to_reg().to_reg());
2899
2900            let (prefix, opcode, opcode_bytes, dst_size, dst_first) = match op {
2901                SseOpcode::Pextrb => (LegacyPrefixes::_66, 0x0F3A14, 3, OS::Size32, false),
2902                SseOpcode::Pextrw => (LegacyPrefixes::_66, 0x0FC5, 2, OS::Size32, true),
2903                SseOpcode::Pextrd => (LegacyPrefixes::_66, 0x0F3A16, 3, OS::Size32, false),
2904                SseOpcode::Pextrq => (LegacyPrefixes::_66, 0x0F3A16, 3, OS::Size64, false),
2905                _ => panic!("unexpected opcode {:?}", op),
2906            };
2907            let rex = RexFlags::from(dst_size);
2908            let (src, dst) = if dst_first { (dst, src) } else { (src, dst) };
2909
2910            emit_std_reg_reg(sink, prefix, opcode, opcode_bytes, src, dst, rex);
2911            sink.put1(*imm);
2912        }
2913
2914        Inst::GprToXmm {
2915            op,
2916            src: src_e,
2917            dst: reg_g,
2918            src_size,
2919        } => {
2920            let reg_g = allocs.next(reg_g.to_reg().to_reg());
2921            let src_e = src_e.clone().to_reg_mem().with_allocs(allocs);
2922
2923            let (prefix, opcode) = match op {
2924                // Movd and movq use the same opcode; the REX.W bit (set below)
2925                // actually determines which is used.
2926                SseOpcode::Movd | SseOpcode::Movq => (LegacyPrefixes::_66, 0x0F6E),
2927                SseOpcode::Cvtsi2ss => (LegacyPrefixes::_F3, 0x0F2A),
2928                SseOpcode::Cvtsi2sd => (LegacyPrefixes::_F2, 0x0F2A),
2929                _ => panic!("unexpected opcode {:?}", op),
2930            };
2931            let rex = RexFlags::from(*src_size);
2932            match src_e {
2933                RegMem::Reg { reg: reg_e } => {
2934                    emit_std_reg_reg(sink, prefix, opcode, 2, reg_g, reg_e, rex);
2935                }
2936                RegMem::Mem { addr } => {
2937                    let addr = &addr.finalize(state, sink);
2938                    emit_std_reg_mem(sink, prefix, opcode, 2, reg_g, addr, rex, 0);
2939                }
2940            }
2941        }
2942
2943        Inst::XmmCmpRmR { op, src, dst } => {
2944            let dst = allocs.next(dst.to_reg());
2945            let src = src.clone().to_reg_mem().with_allocs(allocs);
2946
2947            let rex = RexFlags::clear_w();
2948            let (prefix, opcode, len) = match op {
2949                SseOpcode::Ptest => (LegacyPrefixes::_66, 0x0F3817, 3),
2950                SseOpcode::Ucomisd => (LegacyPrefixes::_66, 0x0F2E, 2),
2951                SseOpcode::Ucomiss => (LegacyPrefixes::None, 0x0F2E, 2),
2952                _ => unimplemented!("Opcode {:?} not implemented", op),
2953            };
2954
2955            match src {
2956                RegMem::Reg { reg } => {
2957                    emit_std_reg_reg(sink, prefix, opcode, len, dst, reg, rex);
2958                }
2959                RegMem::Mem { addr } => {
2960                    let addr = &addr.finalize(state, sink);
2961                    emit_std_reg_mem(sink, prefix, opcode, len, dst, addr, rex, 0);
2962                }
2963            }
2964        }
2965
2966        Inst::CvtUint64ToFloatSeq {
2967            dst_size,
2968            src,
2969            dst,
2970            tmp_gpr1,
2971            tmp_gpr2,
2972        } => {
2973            let src = allocs.next(src.to_reg());
2974            let dst = allocs.next(dst.to_reg().to_reg());
2975            let tmp_gpr1 = allocs.next(tmp_gpr1.to_reg().to_reg());
2976            let tmp_gpr2 = allocs.next(tmp_gpr2.to_reg().to_reg());
2977
2978            // Note: this sequence is specific to 64-bit mode; a 32-bit mode would require a
2979            // different sequence.
2980            //
2981            // Emit the following sequence:
2982            //
2983            //  cmp 0, %src
2984            //  jl handle_negative
2985            //
2986            //  ;; handle positive, which can't overflow
2987            //  cvtsi2sd/cvtsi2ss %src, %dst
2988            //  j done
2989            //
2990            //  ;; handle negative: see below for an explanation of what it's doing.
2991            //  handle_negative:
2992            //  mov %src, %tmp_gpr1
2993            //  shr $1, %tmp_gpr1
2994            //  mov %src, %tmp_gpr2
2995            //  and $1, %tmp_gpr2
2996            //  or %tmp_gpr1, %tmp_gpr2
2997            //  cvtsi2sd/cvtsi2ss %tmp_gpr2, %dst
2998            //  addsd/addss %dst, %dst
2999            //
3000            //  done:
3001
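            // Illustrative example of why the LSB is OR-ed back in (assuming
            // round-to-nearest-even): for src = 2^63 + 2^10 + 1, the correctly
            // rounded f64 is 2^63 + 2^11. With the OR, tmp_gpr2 = 2^62 + 2^9 + 1,
            // which rounds up to 2^62 + 2^10 and doubles to 2^63 + 2^11. A plain
            // shift would give 2^62 + 2^9, an exact tie that rounds to even
            // (2^62) and doubles to 2^63, one ulp too low.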
3002            assert_ne!(src, tmp_gpr1);
3003            assert_ne!(src, tmp_gpr2);
3004            assert_ne!(tmp_gpr1, tmp_gpr2);
3005
3006            let handle_negative = sink.get_label();
3007            let done = sink.get_label();
3008
3009            // If x, seen as a signed int64, is not negative, a signed conversion will do the
3010            // right thing.
3011            // TODO: use `test src, src` here.
3012            let inst = Inst::cmp_rmi_r(OperandSize::Size64, RegMemImm::imm(0), src);
3013            inst.emit(&[], sink, info, state);
3014
3015            one_way_jmp(sink, CC::L, handle_negative);
3016
3017            // Handle a positive int64, which is the "easy" case: a signed conversion will do the
3018            // right thing.
3019            emit_signed_cvt(
3020                sink,
3021                info,
3022                state,
3023                src,
3024                Writable::from_reg(dst),
3025                *dst_size == OperandSize::Size64,
3026            );
3027
3028            let inst = Inst::jmp_known(done);
3029            inst.emit(&[], sink, info, state);
3030
3031            sink.bind_label(handle_negative);
3032
3033            // Divide x by two to get it in range for the signed conversion, keep the LSB, and
3034            // scale it back up on the FP side.
3035            let inst = Inst::gen_move(Writable::from_reg(tmp_gpr1), src, types::I64);
3036            inst.emit(&[], sink, info, state);
3037
3038            // tmp_gpr1 := src >> 1
3039            let inst = Inst::shift_r(
3040                OperandSize::Size64,
3041                ShiftKind::ShiftRightLogical,
3042                Imm8Gpr::new(Imm8Reg::Imm8 { imm: 1 }).unwrap(),
3043                tmp_gpr1,
3044                Writable::from_reg(tmp_gpr1),
3045            );
3046            inst.emit(&[], sink, info, state);
3047
3048            let inst = Inst::gen_move(Writable::from_reg(tmp_gpr2), src, types::I64);
3049            inst.emit(&[], sink, info, state);
3050
3051            let inst = Inst::alu_rmi_r(
3052                OperandSize::Size64,
3053                AluRmiROpcode::And,
3054                RegMemImm::imm(1),
3055                Writable::from_reg(tmp_gpr2),
3056            );
3057            inst.emit(&[], sink, info, state);
3058
3059            let inst = Inst::alu_rmi_r(
3060                OperandSize::Size64,
3061                AluRmiROpcode::Or,
3062                RegMemImm::reg(tmp_gpr1),
3063                Writable::from_reg(tmp_gpr2),
3064            );
3065            inst.emit(&[], sink, info, state);
3066
3067            emit_signed_cvt(
3068                sink,
3069                info,
3070                state,
3071                tmp_gpr2,
3072                Writable::from_reg(dst),
3073                *dst_size == OperandSize::Size64,
3074            );
3075
3076            let add_op = if *dst_size == OperandSize::Size64 {
3077                SseOpcode::Addsd
3078            } else {
3079                SseOpcode::Addss
3080            };
3081            let inst = Inst::xmm_rm_r(add_op, RegMem::reg(dst), Writable::from_reg(dst));
3082            inst.emit(&[], sink, info, state);
3083
3084            sink.bind_label(done);
3085        }
3086
3087        Inst::CvtFloatToSintSeq {
3088            src_size,
3089            dst_size,
3090            is_saturating,
3091            src,
3092            dst,
3093            tmp_gpr,
3094            tmp_xmm,
3095        } => {
3096            let src = allocs.next(src.to_reg());
3097            let dst = allocs.next(dst.to_reg().to_reg());
3098            let tmp_gpr = allocs.next(tmp_gpr.to_reg().to_reg());
3099            let tmp_xmm = allocs.next(tmp_xmm.to_reg().to_reg());
3100
3101            // Emits the following common sequence:
3102            //
3103            // cvttss2si/cvttsd2si %src, %dst
3104            // cmp 1, %dst
3105            // jno done
3106            //
3107            // Then, for saturating conversions:
3108            //
3109            // ;; check for NaN
3110            // ucomiss/ucomisd %src, %src
3111            // jnp not_nan
3112            // xor %dst, %dst
3113            //
3114            // ;; positive inputs get saturated to INT_MAX; negative ones to INT_MIN, which is
3115            // ;; already in %dst.
3116            // xorpd %tmp_xmm, %tmp_xmm
3117            // ucomiss/ucomisd %src, %tmp_xmm
3118            // jnb done
3119            // mov $INT_MAX, %dst
3120            //
3121            // done:
3122            //
3123            // Then, for non-saturating conversions:
3124            //
3125            // ;; check for NaN
3126            // ucomiss/ucomisd %src, %src
3127            // jnp not_nan
3128            // ud2 trap BadConversionToInteger
3129            //
3130            // ;; check if INT_MIN was the correct result, against a magic constant:
3131            // not_nan:
3132            // mov $magic, %tmp_gpr
3133            // movq/movd %tmp_gpr, %tmp_xmm
3134            // ucomiss/ucomisd %tmp_xmm, %src
3135            // jnb/jnbe $check_positive
3136            // ud2 trap IntegerOverflow
3137            //
3138            // ;; if positive, it was a real overflow
3139            // check_positive:
3140            // xorpd %tmp_xmm, %tmp_xmm
3141            // ucomiss/ucomisd %src, %tmp_xmm
3142            // jnb done
3143            // ud2 trap IntegerOverflow
3144            //
3145            // done:
3146
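            // As a concrete illustration of the threshold check below: with an
            // f32 source and i32 destination, -2^31 is exactly representable and
            // is the smallest in-range input, so `src >= -2^31` (jnb) means no
            // overflow. With an f64 source and i32 destination, every input in
            // (-2^31 - 1, -2^31] still truncates to i32::MIN, so the comparison
            // must be strict (jnbe) against a constant that itself overflows
            // (e.g. -2^31 - 1).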
3147            let (cast_op, cmp_op, trunc_op) = match src_size {
3148                OperandSize::Size64 => (SseOpcode::Movq, SseOpcode::Ucomisd, SseOpcode::Cvttsd2si),
3149                OperandSize::Size32 => (SseOpcode::Movd, SseOpcode::Ucomiss, SseOpcode::Cvttss2si),
3150                _ => unreachable!(),
3151            };
3152
3153            let done = sink.get_label();
3154
3155            // The truncation.
3156            let inst = Inst::xmm_to_gpr(trunc_op, src, Writable::from_reg(dst), *dst_size);
3157            inst.emit(&[], sink, info, state);
3158
3159            // Compare against 1: in case of overflow, `dst` holds INT_MIN, and subtracting 1 sets OF.
3160            let inst = Inst::cmp_rmi_r(*dst_size, RegMemImm::imm(1), dst);
3161            inst.emit(&[], sink, info, state);
3162
3163            one_way_jmp(sink, CC::NO, done); // no overflow => done
3164
3165            // Check for NaN.
3166
3167            let inst = Inst::xmm_cmp_rm_r(cmp_op, RegMem::reg(src), src);
3168            inst.emit(&[], sink, info, state);
3169
3170            if *is_saturating {
3171                let not_nan = sink.get_label();
3172                one_way_jmp(sink, CC::NP, not_nan); // go to not_nan if not a NaN
3173
3174                // For NaN, emit 0.
3175                let inst = Inst::alu_rmi_r(
3176                    *dst_size,
3177                    AluRmiROpcode::Xor,
3178                    RegMemImm::reg(dst),
3179                    Writable::from_reg(dst),
3180                );
3181                inst.emit(&[], sink, info, state);
3182
3183                let inst = Inst::jmp_known(done);
3184                inst.emit(&[], sink, info, state);
3185
3186                sink.bind_label(not_nan);
3187
3188                // If the input was positive, saturate to INT_MAX.
3189
3190                // Zero out tmp_xmm.
3191                let inst = Inst::xmm_rm_r(
3192                    SseOpcode::Xorpd,
3193                    RegMem::reg(tmp_xmm),
3194                    Writable::from_reg(tmp_xmm),
3195                );
3196                inst.emit(&[], sink, info, state);
3197
3198                let inst = Inst::xmm_cmp_rm_r(cmp_op, RegMem::reg(src), tmp_xmm);
3199                inst.emit(&[], sink, info, state);
3200
3201                // Jump if >= to done.
3202                one_way_jmp(sink, CC::NB, done);
3203
3204                // Otherwise, put INT_MAX.
3205                if *dst_size == OperandSize::Size64 {
3206                    let inst = Inst::imm(
3207                        OperandSize::Size64,
3208                        0x7fffffffffffffff,
3209                        Writable::from_reg(dst),
3210                    );
3211                    inst.emit(&[], sink, info, state);
3212                } else {
3213                    let inst = Inst::imm(OperandSize::Size32, 0x7fffffff, Writable::from_reg(dst));
3214                    inst.emit(&[], sink, info, state);
3215                }
3216            } else {
3217                let inst = Inst::trap_if(CC::P, TrapCode::BadConversionToInteger);
3218                inst.emit(&[], sink, info, state);
3219
3220                // Check if INT_MIN was the correct result: determine the smallest floating point
3221                // number that would convert to INT_MIN, put it in a temporary register, and compare
3222                // against the src register.
3223                // If the src register is less (or in some cases, less-or-equal) than the threshold,
3224                // trap!
3225
3226                let mut no_overflow_cc = CC::NB; // >=
3227                let output_bits = dst_size.to_bits();
3228                match *src_size {
3229                    OperandSize::Size32 => {
3230                        let cst = Ieee32::pow2(output_bits - 1).neg().bits();
3231                        let inst =
3232                            Inst::imm(OperandSize::Size32, cst as u64, Writable::from_reg(tmp_gpr));
3233                        inst.emit(&[], sink, info, state);
3234                    }
3235                    OperandSize::Size64 => {
3236                        // An f64 can represent `i32::min_value() - 1` exactly with precision to spare,
3237                        // so there are values less than -2^(N-1) that convert correctly to INT_MIN.
3238                        let cst = if output_bits < 64 {
3239                            no_overflow_cc = CC::NBE; // >
3240                            Ieee64::fcvt_to_sint_negative_overflow(output_bits)
3241                        } else {
3242                            Ieee64::pow2(output_bits - 1).neg()
3243                        };
3244                        let inst =
3245                            Inst::imm(OperandSize::Size64, cst.bits(), Writable::from_reg(tmp_gpr));
3246                        inst.emit(&[], sink, info, state);
3247                    }
3248                    _ => unreachable!(),
3249                }
3250
3251                let inst = Inst::gpr_to_xmm(
3252                    cast_op,
3253                    RegMem::reg(tmp_gpr),
3254                    *src_size,
3255                    Writable::from_reg(tmp_xmm),
3256                );
3257                inst.emit(&[], sink, info, state);
3258
3259                let inst = Inst::xmm_cmp_rm_r(cmp_op, RegMem::reg(tmp_xmm), src);
3260                inst.emit(&[], sink, info, state);
3261
3262                // no trap if src >= or > threshold
3263                let inst = Inst::trap_if(no_overflow_cc.invert(), TrapCode::IntegerOverflow);
3264                inst.emit(&[], sink, info, state);
3265
3266                // If positive, it was a real overflow.
3267
3268                // Zero out the tmp_xmm register.
3269                let inst = Inst::xmm_rm_r(
3270                    SseOpcode::Xorpd,
3271                    RegMem::reg(tmp_xmm),
3272                    Writable::from_reg(tmp_xmm),
3273                );
3274                inst.emit(&[], sink, info, state);
3275
3276                let inst = Inst::xmm_cmp_rm_r(cmp_op, RegMem::reg(src), tmp_xmm);
3277                inst.emit(&[], sink, info, state);
3278
3279                // no trap if 0 >= src
3280                let inst = Inst::trap_if(CC::B, TrapCode::IntegerOverflow);
3281                inst.emit(&[], sink, info, state);
3282            }
3283
3284            sink.bind_label(done);
3285        }
3286
3287        Inst::CvtFloatToUintSeq {
3288            src_size,
3289            dst_size,
3290            is_saturating,
3291            src,
3292            dst,
3293            tmp_gpr,
3294            tmp_xmm,
3295            tmp_xmm2,
3296        } => {
3297            let src = allocs.next(src.to_reg());
3298            let dst = allocs.next(dst.to_reg().to_reg());
3299            let tmp_gpr = allocs.next(tmp_gpr.to_reg().to_reg());
3300            let tmp_xmm = allocs.next(tmp_xmm.to_reg().to_reg());
3301            let tmp_xmm2 = allocs.next(tmp_xmm2.to_reg().to_reg());
3302
3303            // The only difference in behavior between saturating and non-saturating is how we
3304            // handle errors. Emits the following sequence:
3305            //
3306            // mov 2**(int_width - 1), %tmp_gpr
3307            // movq/movd %tmp_gpr, %tmp_xmm
3308            // ucomiss/ucomisd %tmp_xmm, %src
3309            // jnb is_large
3310            //
3311            // ;; check for NaN inputs
3312            // jnp not_nan
3313            // -- non-saturating: ud2 trap BadConversionToInteger
3314            // -- saturating: xor %dst, %dst; j done
3315            //
3316            // not_nan:
3317            // cvttss2si/cvttsd2si %src, %dst
3318            // cmp 0, %dst
3319            // jnl done
3320            // -- non-saturating: ud2 trap IntegerOverflow
3321            // -- saturating: xor %dst, %dst; j done
3322            //
3323            // is_large:
3324            // mov %src, %tmp_xmm2
3325            // subss/subsd %tmp_xmm, %tmp_xmm2
3326            // cvttss2si/cvttsd2si %tmp_xmm2, %dst
3327            // cmp 0, %dst
3328            // jnl next_is_large
3329            // -- non-saturating: ud2 trap IntegerOverflow
3330            // -- saturating: mov $UINT_MAX, %dst; j done
3331            //
3332            // next_is_large:
3333            // add 2**(int_width - 1), %dst ;; 2 instructions for 64-bit integers
3334            //
3335            // done:
3336
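            // Worked example for the large-input path (f64 -> u64): for
            // src = 1.5 * 2^63, the threshold in %tmp_xmm is 2^63, so we take
            // is_large; subtracting gives 2^62, cvttsd2si yields
            // 0x4000_0000_0000_0000, and adding 2^63 back produces
            // 0xC000_0000_0000_0000, the expected unsigned result.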
3337            assert_ne!(tmp_xmm, src, "tmp_xmm clobbers src!");
3338
3339            let (sub_op, cast_op, cmp_op, trunc_op) = match src_size {
3340                OperandSize::Size32 => (
3341                    SseOpcode::Subss,
3342                    SseOpcode::Movd,
3343                    SseOpcode::Ucomiss,
3344                    SseOpcode::Cvttss2si,
3345                ),
3346                OperandSize::Size64 => (
3347                    SseOpcode::Subsd,
3348                    SseOpcode::Movq,
3349                    SseOpcode::Ucomisd,
3350                    SseOpcode::Cvttsd2si,
3351                ),
3352                _ => unreachable!(),
3353            };
3354
3355            let done = sink.get_label();
3356
3357            let cst = match src_size {
3358                OperandSize::Size32 => Ieee32::pow2(dst_size.to_bits() - 1).bits() as u64,
3359                OperandSize::Size64 => Ieee64::pow2(dst_size.to_bits() - 1).bits(),
3360                _ => unreachable!(),
3361            };
3362
3363            let inst = Inst::imm(*src_size, cst, Writable::from_reg(tmp_gpr));
3364            inst.emit(&[], sink, info, state);
3365
3366            let inst = Inst::gpr_to_xmm(
3367                cast_op,
3368                RegMem::reg(tmp_gpr),
3369                *src_size,
3370                Writable::from_reg(tmp_xmm),
3371            );
3372            inst.emit(&[], sink, info, state);
3373
3374            let inst = Inst::xmm_cmp_rm_r(cmp_op, RegMem::reg(tmp_xmm), src);
3375            inst.emit(&[], sink, info, state);
3376
3377            let handle_large = sink.get_label();
3378            one_way_jmp(sink, CC::NB, handle_large); // jump to handle_large if src >= large_threshold
3379
3380            if *is_saturating {
3381                // If the input is not NaN, jump over this 0-return; otherwise return 0.
3382                let not_nan = sink.get_label();
3383                one_way_jmp(sink, CC::NP, not_nan);
3384                let inst = Inst::alu_rmi_r(
3385                    *dst_size,
3386                    AluRmiROpcode::Xor,
3387                    RegMemImm::reg(dst),
3388                    Writable::from_reg(dst),
3389                );
3390                inst.emit(&[], sink, info, state);
3391
3392                let inst = Inst::jmp_known(done);
3393                inst.emit(&[], sink, info, state);
3394                sink.bind_label(not_nan);
3395            } else {
3396                // Trap.
3397                let inst = Inst::trap_if(CC::P, TrapCode::BadConversionToInteger);
3398                inst.emit(&[], sink, info, state);
3399            }
3400
3401            // Actual truncation for small inputs: if the result is negative, then we had an
3402            // overflow.
3403
3404            let inst = Inst::xmm_to_gpr(trunc_op, src, Writable::from_reg(dst), *dst_size);
3405            inst.emit(&[], sink, info, state);
3406
3407            let inst = Inst::cmp_rmi_r(*dst_size, RegMemImm::imm(0), dst);
3408            inst.emit(&[], sink, info, state);
3409
3410            one_way_jmp(sink, CC::NL, done); // if dst >= 0, jump to done
3411
3412            if *is_saturating {
3413                // The input was "small" (< 2**(width -1)), so the only way to get an integer
3414                // overflow is because the input was too small: saturate to the min value, i.e. 0.
3415                let inst = Inst::alu_rmi_r(
3416                    *dst_size,
3417                    AluRmiROpcode::Xor,
3418                    RegMemImm::reg(dst),
3419                    Writable::from_reg(dst),
3420                );
3421                inst.emit(&[], sink, info, state);
3422
3423                let inst = Inst::jmp_known(done);
3424                inst.emit(&[], sink, info, state);
3425            } else {
3426                // Trap.
3427                let inst = Inst::trap(TrapCode::IntegerOverflow);
3428                inst.emit(&[], sink, info, state);
3429            }
3430
3431            // Now handle large inputs.
3432
3433            sink.bind_label(handle_large);
3434
3435            let inst = Inst::gen_move(Writable::from_reg(tmp_xmm2), src, types::F64);
3436            inst.emit(&[], sink, info, state);
3437
3438            let inst = Inst::xmm_rm_r(sub_op, RegMem::reg(tmp_xmm), Writable::from_reg(tmp_xmm2));
3439            inst.emit(&[], sink, info, state);
3440
3441            let inst = Inst::xmm_to_gpr(trunc_op, tmp_xmm2, Writable::from_reg(dst), *dst_size);
3442            inst.emit(&[], sink, info, state);
3443
3444            let inst = Inst::cmp_rmi_r(*dst_size, RegMemImm::imm(0), dst);
3445            inst.emit(&[], sink, info, state);
3446
3447            if *is_saturating {
3448                let next_is_large = sink.get_label();
3449                one_way_jmp(sink, CC::NL, next_is_large); // if dst >= 0, jump to next_is_large
3450
3451                // The input was "large" (>= 2**(width -1)), so the only way to get an integer
3452                // overflow is because the input was too large: saturate to the max value.
3453                let inst = Inst::imm(
3454                    OperandSize::Size64,
3455                    if *dst_size == OperandSize::Size64 {
3456                        u64::max_value()
3457                    } else {
3458                        u32::max_value() as u64
3459                    },
3460                    Writable::from_reg(dst),
3461                );
3462                inst.emit(&[], sink, info, state);
3463
3464                let inst = Inst::jmp_known(done);
3465                inst.emit(&[], sink, info, state);
3466                sink.bind_label(next_is_large);
3467            } else {
3468                let inst = Inst::trap_if(CC::L, TrapCode::IntegerOverflow);
3469                inst.emit(&[], sink, info, state);
3470            }
3471
3472            if *dst_size == OperandSize::Size64 {
3473                let inst = Inst::imm(OperandSize::Size64, 1 << 63, Writable::from_reg(tmp_gpr));
3474                inst.emit(&[], sink, info, state);
3475
3476                let inst = Inst::alu_rmi_r(
3477                    OperandSize::Size64,
3478                    AluRmiROpcode::Add,
3479                    RegMemImm::reg(tmp_gpr),
3480                    Writable::from_reg(dst),
3481                );
3482                inst.emit(&[], sink, info, state);
3483            } else {
3484                let inst = Inst::alu_rmi_r(
3485                    OperandSize::Size32,
3486                    AluRmiROpcode::Add,
3487                    RegMemImm::imm(1 << 31),
3488                    Writable::from_reg(dst),
3489                );
3490                inst.emit(&[], sink, info, state);
3491            }
3492
3493            sink.bind_label(done);
3494        }
3495
3496        Inst::LoadExtName { dst, name, offset } => {
3497            let dst = allocs.next(dst.to_reg());
3498
3499            if info.flags.is_pic() {
3500                // Generates: movq symbol@GOTPCREL(%rip), %dst
3501                let enc_dst = int_reg_enc(dst);
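                // Byte layout: a REX.W prefix (with REX.R folded in for
                // r8..r15 destinations), opcode 0x8B (`mov r64, r/m64`), and a
                // ModRM byte with mod=00, r/m=101, i.e. RIP-relative addressing
                // with a 32-bit displacement.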
3502                sink.put1(0x48 | ((enc_dst >> 3) & 1) << 2);
3503                sink.put1(0x8B);
3504                sink.put1(0x05 | ((enc_dst & 7) << 3));
3505                emit_reloc(sink, Reloc::X86GOTPCRel4, name, -4);
3506                sink.put4(0);
3507                // Offset in the relocation above applies to the address of the *GOT entry*, not
3508                // the loaded address; so we emit a separate add or sub instruction if needed.
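                // (The fixup uses the `0x81 /0` (add r/m64, imm32) and
                // `0x81 /5` (sub r/m64, imm32) encodings: 0xc0|reg and 0xe8|reg
                // below are the corresponding mod=11 ModRM bytes.)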
3509                if *offset < 0 {
3510                    assert!(*offset >= -i32::MAX as i64);
3511                    sink.put1(0x48 | ((enc_dst >> 3) & 1));
3512                    sink.put1(0x81);
3513                    sink.put1(0xe8 | (enc_dst & 7));
3514                    sink.put4((-*offset) as u32);
3515                } else if *offset > 0 {
3516                    assert!(*offset <= i32::MAX as i64);
3517                    sink.put1(0x48 | ((enc_dst >> 3) & 1));
3518                    sink.put1(0x81);
3519                    sink.put1(0xc0 | (enc_dst & 7));
3520                    sink.put4(*offset as u32);
3521                }
3522            } else {
3523                // The full address can be encoded in the register, with a relocation.
3524                // Generates: movabsq $name, %dst
3525                let enc_dst = int_reg_enc(dst);
3526                sink.put1(0x48 | ((enc_dst >> 3) & 1));
3527                sink.put1(0xB8 | (enc_dst & 7));
3528                emit_reloc(sink, Reloc::Abs8, name, *offset);
3529                sink.put8(0);
3530            }
3531        }
3532
3533        Inst::LockCmpxchg {
3534            ty,
3535            replacement,
3536            expected,
3537            mem,
3538            dst_old,
3539        } => {
3540            let replacement = allocs.next(*replacement);
3541            let expected = allocs.next(*expected);
3542            let dst_old = allocs.next(dst_old.to_reg());
3543            let mem = mem.with_allocs(allocs);
3544
3545            debug_assert_eq!(expected, regs::rax());
3546            debug_assert_eq!(dst_old, regs::rax());
3547
3548            // lock cmpxchg{b,w,l,q} %replacement, (mem)
3549            // Note that 0xF0 is the Lock prefix.
3550            let (prefix, opcodes) = match *ty {
3551                types::I8 => (LegacyPrefixes::_F0, 0x0FB0),
3552                types::I16 => (LegacyPrefixes::_66F0, 0x0FB1),
3553                types::I32 => (LegacyPrefixes::_F0, 0x0FB1),
3554                types::I64 => (LegacyPrefixes::_F0, 0x0FB1),
3555                _ => unreachable!(),
3556            };
3557            let rex = RexFlags::from((OperandSize::from_ty(*ty), replacement));
3558            let amode = mem.finalize(state, sink);
3559            emit_std_reg_mem(sink, prefix, opcodes, 2, replacement, &amode, rex, 0);
3560        }
3561
3562        Inst::AtomicRmwSeq {
3563            ty,
3564            op,
3565            mem,
3566            operand,
3567            temp,
3568            dst_old,
3569        } => {
3570            let operand = allocs.next(*operand);
3571            let temp = allocs.next_writable(*temp);
3572            let dst_old = allocs.next_writable(*dst_old);
3573            debug_assert_eq!(dst_old.to_reg(), regs::rax());
3574            let mem = mem.finalize(state, sink).with_allocs(allocs);
3575
3576            // Emit this:
3577            //    mov{zbq,zwq,zlq,q}     (%r_address), %rax    // rax = old value
3578            //  again:
3579            //    movq                   %rax, %r_temp         // rax = old value, r_temp = old value
3580            //    `op`q                  %r_operand, %r_temp   // rax = old value, r_temp = new value
3581            //    lock cmpxchg{b,w,l,q}  %r_temp, (%r_address) // try to store new value
3582            //    jnz again // If this is taken, rax will have a "revised" old value
3583            //
3584            // Operand conventions:
3585            //    IN:  %r_address, %r_operand
3586            //    OUT: %rax (old value), %r_temp (trashed), %rflags (trashed)
3587            //
3588            // In the case where the operation is 'xchg', the "`op`q" step is instead
3589            //    movq %r_operand, %r_temp
3590            // so that we simply write the "2nd arg for `op`" into the destination.
3591            //
3592            // TODO: this sequence can be significantly improved (e.g., to `lock
3593            // <op>`) when it is known that `dst_old` is not used later, see
3594            // https://github.com/bytecodealliance/wasmtime/issues/2153.
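            // For the min/max variants there is no single ALU instruction, so
            // the "`op`q" step below is a `cmp` followed by a `cmovcc` that
            // keeps either the loaded old value or the operand in %r_temp.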
3595            let again_label = sink.get_label();
3596
3597            // mov{zbq,zwq,zlq,q} (%r_address), %rax
3598            // No need to call `add_trap` here, since the `i1` emit will do that.
3599            let i1 = Inst::load(*ty, mem.clone(), dst_old, ExtKind::ZeroExtend);
3600            i1.emit(&[], sink, info, state);
3601
3602            // again:
3603            sink.bind_label(again_label);
3604
3605            // movq %rax, %r_temp
3606            let i2 = Inst::mov_r_r(OperandSize::Size64, dst_old.to_reg(), temp);
3607            i2.emit(&[], sink, info, state);
3608
3609            let operand_rmi = RegMemImm::reg(operand);
3610            use inst_common::MachAtomicRmwOp as RmwOp;
3611            match op {
3612                RmwOp::Xchg => {
3613                    // movq %r_operand, %r_temp
3614                    let i3 = Inst::mov_r_r(OperandSize::Size64, operand, temp);
3615                    i3.emit(&[], sink, info, state);
3616                }
3617                RmwOp::Nand => {
3618                    // andq %r_operand, %r_temp
3619                    let i3 =
3620                        Inst::alu_rmi_r(OperandSize::Size64, AluRmiROpcode::And, operand_rmi, temp);
3621                    i3.emit(&[], sink, info, state);
3622
3623                    // notq %r_temp
3624                    let i4 = Inst::not(OperandSize::Size64, temp);
3625                    i4.emit(&[], sink, info, state);
3626                }
3627                RmwOp::Umin | RmwOp::Umax | RmwOp::Smin | RmwOp::Smax => {
3628                    // cmp %r_temp, %r_operand
3629                    let i3 = Inst::cmp_rmi_r(
3630                        OperandSize::from_ty(*ty),
3631                        RegMemImm::reg(temp.to_reg()),
3632                        operand,
3633                    );
3634                    i3.emit(&[], sink, info, state);
3635
3636                    // cmovcc %r_operand, %r_temp
3637                    let cc = match op {
3638                        RmwOp::Umin => CC::BE,
3639                        RmwOp::Umax => CC::NB,
3640                        RmwOp::Smin => CC::LE,
3641                        RmwOp::Smax => CC::NL,
3642                        _ => unreachable!(),
3643                    };
3644                    let i4 = Inst::cmove(OperandSize::Size64, cc, RegMem::reg(operand), temp);
3645                    i4.emit(&[], sink, info, state);
3646                }
3647                _ => {
3648                    // opq %r_operand, %r_temp
3649                    let alu_op = match op {
3650                        RmwOp::Add => AluRmiROpcode::Add,
3651                        RmwOp::Sub => AluRmiROpcode::Sub,
3652                        RmwOp::And => AluRmiROpcode::And,
3653                        RmwOp::Or => AluRmiROpcode::Or,
3654                        RmwOp::Xor => AluRmiROpcode::Xor,
3655                        RmwOp::Xchg
3656                        | RmwOp::Nand
3657                        | RmwOp::Umin
3658                        | RmwOp::Umax
3659                        | RmwOp::Smin
3660                        | RmwOp::Smax => unreachable!(),
3661                    };
3662                    let i3 = Inst::alu_rmi_r(OperandSize::Size64, alu_op, operand_rmi, temp);
3663                    i3.emit(&[], sink, info, state);
3664                }
3665            }
3666
3667            // lock cmpxchg{b,w,l,q} %r_temp, (%r_address)
3668            // No need to call `add_trap` here, since the `i4` emit will do that.
3669            let i4 = Inst::LockCmpxchg {
3670                ty: *ty,
3671                replacement: temp.to_reg(),
3672                expected: dst_old.to_reg(),
3673                mem: mem.into(),
3674                dst_old,
3675            };
3676            i4.emit(&[], sink, info, state);
3677
3678            // jnz again
3679            one_way_jmp(sink, CC::NZ, again_label);
3680        }
3681
3682        Inst::Fence { kind } => {
3683            sink.put1(0x0F);
3684            sink.put1(0xAE);
3685            match kind {
3686                FenceKind::MFence => sink.put1(0xF0), // mfence = 0F AE F0
3687                FenceKind::LFence => sink.put1(0xE8), // lfence = 0F AE E8
3688                FenceKind::SFence => sink.put1(0xF8), // sfence = 0F AE F8
3689            }
3690        }
3691
3692        Inst::Hlt => {
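            // Note: 0xCC is the single-byte `int3` breakpoint encoding rather
            // than the privileged `hlt` opcode (0xF4), so this traps cleanly
            // from user mode.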
3693            sink.put1(0xcc);
3694        }
3695
3696        Inst::Ud2 { trap_code } => {
3697            sink.add_trap(*trap_code);
3698            if let Some(s) = state.take_stack_map() {
3699                sink.add_stack_map(StackMapExtent::UpcomingBytes(2), s);
3700            }
3701            sink.put_data(Inst::TRAP_OPCODE);
3702        }
3703
3704        Inst::VirtualSPOffsetAdj { offset } => {
            trace!(
                "virtual sp offset adjusted by {} -> {}",
                offset,
                state.virtual_sp_offset + offset
            );
            state.virtual_sp_offset += offset;
        }

        Inst::Nop { len } => {
            // These encodings can all be found in Intel's architecture manual, under
            // the NOP instruction description.
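            // Requests longer than 9 bytes are split into a sequence of the canonical
            // multi-byte NOPs below, each at most 9 bytes long.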
            let mut len = *len;
            while len != 0 {
                let emitted = u8::min(len, 9);
                match emitted {
                    0 => {}
                    1 => sink.put1(0x90), // NOP
                    2 => {
                        // 66 NOP
                        sink.put1(0x66);
                        sink.put1(0x90);
                    }
                    3 => {
                        // NOP [EAX]
                        sink.put1(0x0F);
                        sink.put1(0x1F);
                        sink.put1(0x00);
                    }
                    4 => {
                        // NOP 0(EAX), with 0 a 1-byte immediate.
                        sink.put1(0x0F);
                        sink.put1(0x1F);
                        sink.put1(0x40);
                        sink.put1(0x00);
                    }
                    5 => {
                        // NOP [EAX, EAX, 1]
                        sink.put1(0x0F);
                        sink.put1(0x1F);
                        sink.put1(0x44);
                        sink.put1(0x00);
                        sink.put1(0x00);
                    }
                    6 => {
                        // 66 NOP [EAX, EAX, 1]
                        sink.put1(0x66);
                        sink.put1(0x0F);
                        sink.put1(0x1F);
                        sink.put1(0x44);
                        sink.put1(0x00);
                        sink.put1(0x00);
                    }
                    7 => {
                        // NOP 0[EAX], with 0 a 4-byte immediate.
                        sink.put1(0x0F);
                        sink.put1(0x1F);
                        sink.put1(0x80);
                        sink.put1(0x00);
                        sink.put1(0x00);
                        sink.put1(0x00);
                        sink.put1(0x00);
                    }
                    8 => {
                        // NOP 0[EAX, EAX, 1], with 0 a 4-byte immediate.
                        sink.put1(0x0F);
                        sink.put1(0x1F);
                        sink.put1(0x84);
                        sink.put1(0x00);
                        sink.put1(0x00);
                        sink.put1(0x00);
                        sink.put1(0x00);
                        sink.put1(0x00);
                    }
                    9 => {
                        // 66 NOP 0[EAX, EAX, 1], with 0 a 4-byte immediate.
                        sink.put1(0x66);
                        sink.put1(0x0F);
                        sink.put1(0x1F);
                        sink.put1(0x84);
                        sink.put1(0x00);
                        sink.put1(0x00);
                        sink.put1(0x00);
                        sink.put1(0x00);
                        sink.put1(0x00);
                    }
                    _ => unreachable!(),
                }
                len -= emitted;
            }
        }

        Inst::ElfTlsGetAddr { ref symbol, dst } => {
            let dst = allocs.next(dst.to_reg().to_reg());
            debug_assert_eq!(dst, regs::rax());

            // N.B.: Must be exactly this byte sequence; the linker requires it,
            // because it must know how to rewrite the bytes.
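            // The redundant data16 prefixes pad the general-dynamic TLS sequence to a
            // fixed length so the linker can rewrite it in place (e.g. relax it to an
            // initial-exec or local-exec access) when the definition turns out to be
            // local to the module.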

            // data16 lea gv@tlsgd(%rip),%rdi
            sink.put1(0x66); // data16
            sink.put1(0b01001000); // REX.W
            sink.put1(0x8d); // LEA
            sink.put1(0x3d); // ModRM byte
            emit_reloc(sink, Reloc::ElfX86_64TlsGd, symbol, -4);
            sink.put4(0); // offset

            // data16 data16 callq __tls_get_addr-4
            sink.put1(0x66); // data16
            sink.put1(0x66); // data16
            sink.put1(0b01001000); // REX.W
            sink.put1(0xe8); // CALL
            emit_reloc(
                sink,
                Reloc::X86CallPLTRel4,
                &ExternalName::LibCall(LibCall::ElfTlsGetAddr),
                -4,
            );
            sink.put4(0); // offset
        }

        Inst::MachOTlsGetAddr { ref symbol, dst } => {
            let dst = allocs.next(dst.to_reg().to_reg());
            debug_assert_eq!(dst, regs::rax());

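            // On Mach-O, `symbol@tlv` resolves to a thread-local variable descriptor
            // whose first word is a getter function; loading the descriptor address
            // into %rdi and calling through it yields the variable's address in %rax.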
            // movq gv@tlv(%rip), %rdi
            sink.put1(0x48); // REX.W
            sink.put1(0x8b); // MOV
            sink.put1(0x3d); // ModRM byte
            emit_reloc(sink, Reloc::MachOX86_64Tlv, symbol, -4);
            sink.put4(0); // offset

            // callq *(%rdi)
            sink.put1(0xff);
            sink.put1(0x17);
        }

        Inst::CoffTlsGetAddr {
            ref symbol,
            dst,
            tmp,
        } => {
            let dst = allocs.next(dst.to_reg().to_reg());
            debug_assert_eq!(dst, regs::rax());

            // tmp is used below directly as %rcx
            let tmp = allocs.next(tmp.to_reg().to_reg());
            debug_assert_eq!(tmp, regs::rcx());

            // See: https://gcc.godbolt.org/z/M8or9x6ss
            // And: https://github.com/bjorn3/rustc_codegen_cranelift/issues/388#issuecomment-532930282

            // Emit the following sequence:
            // movl    (%rip), %eax          ; IMAGE_REL_AMD64_REL32    _tls_index
            // movq    %gs:88, %rcx
            // movq    (%rcx,%rax,8), %rax
            // leaq    (%rax), %rax          ; Reloc: IMAGE_REL_AMD64_SECREL    symbol

            // Load TLS index for current thread
            // movl    (%rip), %eax
            sink.put1(0x8b); // mov
            sink.put1(0x05);
            emit_reloc(
                sink,
                Reloc::X86PCRel4,
                &ExternalName::KnownSymbol(KnownSymbol::CoffTlsIndex),
                -4,
            );
            sink.put4(0); // offset

            // movq    %gs:88, %rcx
            // Load the TLS Storage Array pointer
            // The gs segment register refers to the base address of the TEB on x64.
            // 0x58 is the offset in the TEB for the ThreadLocalStoragePointer member on x64:
            sink.put_data(&[
                0x65, // GS segment override prefix
                0x48, // REX.W
                0x8b, // MOV
                0x0c, 0x25, // ModRM + SIB: absolute disp32 address
                0x58, 0x00, 0x00, 0x00, // disp32 = 0x58, the ThreadLocalStoragePointer offset
            ]);

            // movq    (%rcx,%rax,8), %rax
            // Load the actual TLS entry for this thread.
            // Computes ThreadLocalStoragePointer + _tls_index*8
            sink.put_data(&[0x48, 0x8b, 0x04, 0xc1]);

            // leaq    (%rax), %rax
            sink.put1(0x48);
            sink.put1(0x8d);
            sink.put1(0x80);
            emit_reloc(sink, Reloc::X86SecRel, symbol, 0);
            sink.put4(0); // offset
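            // %rax now holds the address of `symbol` in the current thread's TLS
            // block: the module's TLS base plus the symbol's SECREL offset.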
        }

        Inst::Unwind { ref inst } => {
            sink.add_unwind(inst.clone());
        }

        Inst::DummyUse { .. } => {
            // Nothing to emit: a dummy use exists only to keep its register alive.
        }
    }

    state.clear_post_insn();
}