// polkavm_common/simulator.rs

1#![allow(clippy::undocumented_unsafe_blocks)]
2#![allow(unsafe_code)]
3
4use crate::cast::cast;
5use crate::program::{InstructionFormat, InstructionSet, InstructionSetKind, Opcode, ParsingVisitor, RawReg, UNUSED_RAW_OPCODE};
6use crate::utils::{Bitness, BitnessT, GasVisitorT, B64};
7use alloc::string::String;
8use alloc::vec;
9
10#[cfg(feature = "simd")]
11use picosimd::amd64::{
12    avx2::i8x32,
13    avx2_composite::{i16x32, i32x32},
14    sse::i8x16,
15};
16
17#[cfg(not(feature = "simd"))]
18use picosimd::fallback::{i16x32, i32x32, i8x16, i8x32};
19
// Without the AVX2 backend (no `simd` feature or not on x86_64) the fallback
// SIMD implementations are plain safe Rust, so the tokens pass through as-is.
#[cfg(not(all(feature = "simd", target_arch = "x86_64")))]
macro_rules! unsafe_avx2 {
    ($($t:tt)*) => { $($t)* }
}

// With the AVX2 backend the wrapped code calls `target_feature`-gated
// functions, which require an `unsafe` block.
#[cfg(all(feature = "simd", target_arch = "x86_64"))]
macro_rules! unsafe_avx2 {
    ($($t:tt)*) => { unsafe { $($t)* } }
}
29
/// The cache model used for memory accesses (see [`Simulator`]).
///
/// NOTE(review): presumably each variant models every access as hitting the
/// given cache level — confirm against the memory-access cost tables.
#[derive(Copy, Clone, Debug, Hash)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub enum CacheModel {
    L1Hit,
    L2Hit,
    L3Hit,
}
37
/// The maximum number of instruction slots available per cycle.
const MAX_DECODE_PER_CYCLE: u32 = 4;

/// The maximum number of instructions in-flight.
const REORDER_BUFFER_SIZE: usize = 32;

/// The maximum number of cycles refunded at the end of each basic block.
const GAS_COST_SLACK: i32 = 3;
46
/// The simulated cost of a single instruction.
#[derive(Copy, Clone, Debug)]
pub struct InstCost {
    /// The number of cycles the instruction takes once it starts executing
    /// (seeds `rob_cycles_remaining`; must be non-negative).
    pub latency: i8,
    /// How many decode slots the instruction occupies, out of `MAX_DECODE_PER_CYCLE`.
    pub decode_slots: u32,
    /// How many ALU slots the instruction needs to start executing.
    pub alu_slots: u32,
    /// How many multiplier slots the instruction needs to start executing.
    pub mul_slots: u32,
    /// How many divider slots the instruction needs to start executing.
    pub div_slots: u32,
    /// How many load slots the instruction needs to start executing.
    pub load_slots: u32,
    /// How many store slots the instruction needs to start executing.
    pub store_slots: u32,
}
57
// The total number of available slots for each execution resource
// (see `Simulator::clear`, which seeds `resources_available` from these).
const MAX_ALU_SLOTS: u32 = 4;
const MAX_LOAD_SLOTS: u32 = 4;
const MAX_STORE_SLOTS: u32 = 4;
const MAX_MUL_SLOTS: u32 = 1;
const MAX_DIV_SLOTS: u32 = 1;
63
/// Returns the number of bits needed to represent any value in `0..=value`,
/// i.e. the bit length of `value` (0 for `value == 0`).
const fn bits_needed(value: u32) -> u32 {
    // Equivalent to `(value + 1).next_power_of_two().ilog2()` — both compute
    // ceil(log2(value + 1)) — but also works for `u32::MAX`, where the
    // `value + 1` of the original form would overflow and panic at compile time.
    u32::BITS - value.leading_zeros()
}
67
// The number of bits needed to store each resource counter.
const ALU_BITS: u32 = bits_needed(MAX_ALU_SLOTS);
const LOAD_BITS: u32 = bits_needed(MAX_LOAD_SLOTS);
const STORE_BITS: u32 = bits_needed(MAX_STORE_SLOTS);
const MUL_BITS: u32 = bits_needed(MAX_MUL_SLOTS);
const DIV_BITS: u32 = bits_needed(MAX_DIV_SLOTS);

// Sanity check: each resource's maximum count must fit in its allotted bits.
#[allow(clippy::int_plus_one)]
const _: () = {
    assert!((1 << ALU_BITS) - 1 >= MAX_ALU_SLOTS);
    assert!((1 << LOAD_BITS) - 1 >= MAX_LOAD_SLOTS);
    assert!((1 << STORE_BITS) - 1 >= MAX_STORE_SLOTS);
    assert!((1 << MUL_BITS) - 1 >= MAX_MUL_SLOTS);
    assert!((1 << DIV_BITS) - 1 >= MAX_DIV_SLOTS);
};

// Bit offsets of each counter within the packed resource word. Each counter is
// followed by one extra guard bit (hence the `+ 1`), used to detect when a
// subtraction of resource counts underflows the field below it.
const ALU_OFFSET: u32 = 0;
const LOAD_OFFSET: u32 = ALU_OFFSET + ALU_BITS + 1;
const STORE_OFFSET: u32 = LOAD_OFFSET + LOAD_BITS + 1;
const MUL_OFFSET: u32 = STORE_OFFSET + STORE_BITS + 1;
const DIV_OFFSET: u32 = MUL_OFFSET + MUL_BITS + 1;

// A mask with every counter's guard bit set. While the guard bits are kept set
// in `resources_available`, a packed subtraction that clears one of them
// indicates the corresponding counter underflowed (see `tick_cycle_avx2`).
const RESOURCES_UNDERFLOW_MASK: u32 = (1 << (ALU_BITS + ALU_OFFSET))
    | (1 << (LOAD_BITS + LOAD_OFFSET))
    | (1 << (STORE_BITS + STORE_OFFSET))
    | (1 << (MUL_BITS + MUL_OFFSET))
    | (1 << (DIV_BITS + DIV_OFFSET));
94
#[cfg(all(test, feature = "logging"))]
struct DebugResources(u32);

/// Pretty-prints the packed resource word as individual named counters.
#[cfg(all(test, feature = "logging"))]
impl core::fmt::Debug for DebugResources {
    fn fmt(&self, fmt: &mut core::fmt::Formatter) -> core::fmt::Result {
        // Extracts the counter stored at `offset` which occupies `bits` bits.
        let extract = |offset: u32, bits: u32| (self.0 >> offset) & ((1 << bits) - 1);
        fmt.debug_struct("Resources")
            .field("alu", &extract(ALU_OFFSET, ALU_BITS))
            .field("load", &extract(LOAD_OFFSET, LOAD_BITS))
            .field("store", &extract(STORE_OFFSET, STORE_BITS))
            .field("mul", &extract(MUL_OFFSET, MUL_BITS))
            .field("div", &extract(DIV_OFFSET, DIV_BITS))
            .finish()
    }
}
110
#[cfg(all(test, feature = "logging"))]
struct DebugDeps([i32; 32]);

/// Formats per-slot dependency bitmasks as `slot={dep,dep,...}`, skipping
/// slots whose mask is empty.
#[cfg(all(test, feature = "logging"))]
impl core::fmt::Debug for DebugDeps {
    fn fmt(&self, fmt: &mut core::fmt::Formatter) -> core::fmt::Result {
        fmt.write_str("{")?;
        let iter = self.0.into_iter().enumerate().filter(|(_, deps)| *deps != 0);
        // Count the printable entries up front so that ", " is only emitted
        // between entries, never after the last one.
        let mut remaining = iter.clone().count();
        for (nth, mut deps) in iter {
            write!(fmt, "{nth}={{")?;
            // Pop the set bits one at a time, lowest slot index first.
            while deps != 0 {
                let slot = deps.trailing_zeros();
                deps &= !(1 << slot);
                write!(fmt, "{slot}")?;
                if deps != 0 {
                    fmt.write_str(",")?;
                }
            }
            fmt.write_str("}")?;
            remaining -= 1;
            if remaining > 0 {
                fmt.write_str(", ")?;
            }
        }
        fmt.write_str("}")?;

        Ok(())
    }
}
141
#[cfg(all(test, feature = "logging"))]
struct DebugMask([i8; 32]);

/// Formats a per-slot lane mask: a fully-set lane (`-1`) prints as the bare
/// slot index, any other non-zero lane prints as `slot={value}`.
#[cfg(all(test, feature = "logging"))]
impl core::fmt::Debug for DebugMask {
    fn fmt(&self, fmt: &mut core::fmt::Formatter) -> core::fmt::Result {
        fmt.write_str("{")?;
        let iter = self.0.into_iter().enumerate().filter(|(_, mask)| *mask != 0);
        // Count the printable entries up front so that ", " is only emitted
        // between entries, never after the last one.
        let mut remaining = iter.clone().count();
        for (nth, mask) in iter {
            // Zero lanes were already excluded by the `filter` above, so the
            // original `mask == 0 => continue` branch was unreachable dead
            // code (and, had it ever fired, would have desynced `remaining`).
            if mask == -1 {
                write!(fmt, "{nth}")?;
            } else {
                write!(fmt, "{nth}={{{mask}}}")?;
            }

            remaining -= 1;
            if remaining > 0 {
                fmt.write_str(", ")?;
            }
        }
        fmt.write_str("}")?;

        Ok(())
    }
}
170
#[cfg(all(test, feature = "logging"))]
struct DebugEntryByRegister([i8; 16]);

/// Formats `rob_entry_by_register` as `reg=slot` pairs, skipping registers
/// with no in-flight writer (sentinel entry `-1`).
#[cfg(all(test, feature = "logging"))]
impl core::fmt::Debug for DebugEntryByRegister {
    fn fmt(&self, fmt: &mut core::fmt::Formatter) -> core::fmt::Result {
        fmt.write_str("{")?;
        // Count the printable entries up front so that ", " is only emitted
        // between entries, never after the last one.
        let mut remaining = self.0.iter().filter(|&&entry| entry != -1).count();
        for (reg, entry) in crate::program::Reg::ALL.into_iter().zip(self.0.into_iter()) {
            if entry == -1 {
                continue;
            }

            write!(fmt, "{reg}={entry}")?;
            remaining -= 1;
            if remaining > 0 {
                fmt.write_str(", ")?;
            }
        }
        fmt.write_str("}")?;

        Ok(())
    }
}
195
#[cfg(all(test, feature = "logging"))]
struct DebugCyclesRemaining([i8; 32]);

/// Formats the remaining-cycle counter of every reorder-buffer slot as
/// comma-separated `index=count` pairs wrapped in braces.
#[cfg(all(test, feature = "logging"))]
impl core::fmt::Debug for DebugCyclesRemaining {
    fn fmt(&self, fmt: &mut core::fmt::Formatter) -> core::fmt::Result {
        fmt.write_str("{")?;
        for (index, count) in self.0.into_iter().enumerate() {
            // Print the separator before every entry except the first; since
            // all 32 entries are printed this matches a trailing-separator loop.
            if index > 0 {
                fmt.write_str(", ")?;
            }
            write!(fmt, "{index}={count}")?;
        }
        fmt.write_str("}")?;

        Ok(())
    }
}
216
#[cfg(all(test, feature = "logging"))]
struct DebugState([i8; 32]);

/// Formats the state of each occupied reorder-buffer slot, abbreviating the
/// known states (1 = 'D', 2 = 'w', 3 = 'e', 4 = 'X') and printing any other
/// state numerically.
#[cfg(all(test, feature = "logging"))]
impl core::fmt::Debug for DebugState {
    fn fmt(&self, fmt: &mut core::fmt::Formatter) -> core::fmt::Result {
        fmt.write_str("{")?;
        let occupied = self.0.into_iter().enumerate().filter(|(_, state)| *state != 0);
        // Count the printable entries up front so that ", " is only emitted
        // between entries, never after the last one.
        let mut remaining = occupied.clone().count();
        for (nth, state) in occupied {
            remaining -= 1;
            match state {
                1 => write!(fmt, "{nth}=D")?,
                2 => write!(fmt, "{nth}=w")?,
                3 => write!(fmt, "{nth}=e")?,
                4 => write!(fmt, "{nth}=X")?,
                other => write!(fmt, "{nth}={other}")?,
            }
            if remaining > 0 {
                fmt.write_str(", ")?;
            }
        }
        fmt.write_str("}")?;

        Ok(())
    }
}
252
impl InstCost {
    /// Packs the per-resource slot counts into a single word using the
    /// `*_OFFSET`/`*_BITS` layout (guard bits left clear).
    #[inline(always)]
    const fn resources(&self) -> u32 {
        // Counts above the maxima would spill into the neighbouring guard bit
        // and corrupt the packed encoding.
        assert!(self.alu_slots <= MAX_ALU_SLOTS);
        assert!(self.mul_slots <= MAX_MUL_SLOTS);
        assert!(self.div_slots <= MAX_DIV_SLOTS);
        assert!(self.load_slots <= MAX_LOAD_SLOTS);
        assert!(self.store_slots <= MAX_STORE_SLOTS);

        (self.alu_slots << ALU_OFFSET)
            | (self.load_slots << LOAD_OFFSET)
            | (self.store_slots << STORE_OFFSET)
            | (self.mul_slots << MUL_OFFSET)
            | (self.div_slots << DIV_OFFSET)
    }
}
269
/// A baseline cost: occupies a single decode slot, has no latency and uses no
/// execution resources. Also used as the base for struct-update syntax.
const EMPTY_COST: InstCost = InstCost {
    latency: 0,
    decode_slots: 1,
    alu_slots: 0,
    mul_slots: 0,
    div_slots: 0,
    load_slots: 0,
    store_slots: 0,
};
279
/// A stage transition observed for a single instruction in the pipeline.
#[derive(Copy, Clone, Debug)]
pub enum EventKind {
    Decode,
    WaitingForDependencies,
    Executing,
    Executed,
    WaitingForRetirement,
    Retired,
}

/// Maps each event kind to its single-character code.
impl From<EventKind> for char {
    fn from(kind: EventKind) -> char {
        use EventKind::*;
        match kind {
            Decode => 'D',
            WaitingForDependencies => '=',
            Executing => 'e',
            Executed => 'E',
            WaitingForRetirement => '-',
            Retired => 'R',
        }
    }
}
302
/// A hook for observing the simulator's progress.
pub trait Tracer: Sized {
    // A flag to make it easier for the optimizer to get rid of dead code.
    const SHOULD_CALL_ON_EVENT: bool;

    /// Whether the simulator may skip over multiple cycles at once when
    /// draining the pipeline; queried before every tick.
    fn should_enable_fast_forward(&self) -> bool {
        true
    }

    /// Called with the current cycle, the instruction's index and the event
    /// kind; only invoked when `SHOULD_CALL_ON_EVENT` is `true`.
    fn on_event(&mut self, _cycle: u32, _instruction: u32, _event: EventKind) {}
}

/// The default no-op tracer: event reporting is compiled out entirely.
impl Tracer for () {
    const SHOULD_CALL_ON_EVENT: bool = false;
}
317
/// Simulates an out-of-order processor pipeline (decode slots, a reorder
/// buffer, and per-resource execution slots) cycle by cycle.
pub struct Simulator<'a, B, T: Tracer = ()> {
    /// The bytecode of the whole program.
    code: &'a [u8],
    /// The cycle the simulation is currently on.
    cycles: u32,
    /// The index of the instruction we're currently at when feeding code into the simulator.
    instructions: u32,
    /// Have we finished the simulation?
    finished: bool,
    /// Number of decode slots still available during this cycle.
    decode_slots_remaining_this_cycle: u32,
    /// Number of currently available resources, packed into a single field
    /// (see `InstCost::resources` and `RESOURCES_UNDERFLOW_MASK`).
    resources_available: u32,
    /// The number of instructions currently in the reorder buffer.
    instructions_in_flight: u32,
    /// The offset of the first instruction in the reorder buffer (which is a circular buffer).
    reorder_buffer_head: u32,
    /// The next slot to be allocated in the reorder buffer (which is a circular buffer).
    reorder_buffer_tail: u32,
    /// Which exact instruction does the reorder buffer contain at a given position?
    /// Used only when emitting events.
    rob_instruction: [u32; REORDER_BUFFER_SIZE],
    /// The state of each entry in the reorder buffer.
    /// (0 = empty, 1 = decoding, 2 = waiting, 3 = executing, 4 = executed.)
    rob_state: i8x32,
    /// The number of cycles remaining for each instruction in the reorder buffer.
    rob_cycles_remaining: i8x32,
    /// The resources required to start execution for each instruction in the reorder buffer.
    rob_required_resources: i16x32,
    /// A bitmask which contains each instruction's dependencies.
    rob_dependencies: i32x32,
    /// A bitmask which contains each instruction's reverse dependencies.
    rob_depended_by: i32x32,
    /// A bitmask of all of the registers which a given instruction in the reorder buffer has written into.
    registers_written_by_rob_entry: i16x32,
    /// The index of the reorder buffer entry which has last written into a given
    /// register; `-1` when no in-flight instruction writes the register.
    rob_entry_by_register: i8x16,
    /// The cache model used for memory accesses.
    cache_model: CacheModel,
    /// When set this overrides the branch costs to be always either cheap (== branch hit) or expensive (== branch miss).
    force_branch_is_cheap: Option<bool>,

    /// The raw opcode for `Opcode::trap`, or `UNUSED_RAW_OPCODE` if the
    /// instruction set doesn't have it.
    opcode_trap: u8,
    /// The raw opcode for `Opcode::unlikely`, or `UNUSED_RAW_OPCODE` if the
    /// instruction set doesn't have it.
    opcode_unlikely: u8,

    /// The tracer which receives simulation events.
    tracer: T,
    _phantom: core::marker::PhantomData<B>,
}
365
366impl<'a, B, T> Simulator<'a, B, T>
367where
368    T: Tracer,
369    B: BitnessT,
370{
    /// Creates a new simulator for `code` using the given instruction set,
    /// cache model and tracer.
    pub fn new(code: &'a [u8], isa: InstructionSetKind, cache_model: CacheModel, tracer: T) -> Self {
        // NOTE(review): with the `simd` feature on x86_64 this runs AVX2-gated
        // helpers inside `unsafe_avx2!`; confirm AVX2 availability is verified
        // by the caller before constructing a simulator.
        unsafe_avx2! {
            let mut simulator = Simulator {
                code,
                rob_instruction: [0; REORDER_BUFFER_SIZE],
                cycles: 0,
                instructions: 0,
                finished: false,
                decode_slots_remaining_this_cycle: 0,
                resources_available: 0,
                rob_state: i8x32::zero(),
                rob_cycles_remaining: i8x32::zero(),
                rob_required_resources: i16x32::zero(),
                rob_dependencies: i32x32::zero(),
                rob_depended_by: i32x32::zero(),
                registers_written_by_rob_entry: i16x32::zero(),
                rob_entry_by_register: i8x16::zero(),
                reorder_buffer_tail: 0,
                cache_model,
                tracer,
                force_branch_is_cheap: None,
                instructions_in_flight: 0,
                reorder_buffer_head: 0,
                // Cache the raw opcode bytes this instruction set uses for
                // `trap` and `unlikely`.
                opcode_trap: isa.opcode_to_u8(Opcode::trap).unwrap_or(UNUSED_RAW_OPCODE),
                opcode_unlikely: isa.opcode_to_u8(Opcode::unlikely).unwrap_or(UNUSED_RAW_OPCODE),
                _phantom: core::marker::PhantomData,
            };

            // `clear()` establishes the real initial state; several fields
            // above (e.g. `resources_available`, `rob_entry_by_register`)
            // are only placeholders until it runs.
            simulator.clear();
            simulator
        }
    }
403
    /// Overrides branch costs: `Some(true)` forces every branch to be cheap
    /// (== branch hit), `Some(false)` forces every branch to be expensive
    /// (== branch miss), and `None` restores the default behavior.
    pub fn set_force_branch_is_cheap(&mut self, value: Option<bool>) {
        self.force_branch_is_cheap = value;
    }
407
    /// Resets the simulation state (cycle counter, reorder buffer, resource
    /// counters) while keeping the code, cached opcodes, cache model and tracer.
    fn clear(&mut self) {
        self.cycles = 0;
        self.instructions = 0;
        self.finished = false;
        self.instructions_in_flight = 0;
        self.decode_slots_remaining_this_cycle = MAX_DECODE_PER_CYCLE;
        // Start with every execution resource fully available; the guard bits
        // from RESOURCES_UNDERFLOW_MASK are kept set so that subtracting
        // resource counts later can detect underflow.
        self.resources_available = InstCost {
            alu_slots: MAX_ALU_SLOTS,
            mul_slots: MAX_MUL_SLOTS,
            div_slots: MAX_DIV_SLOTS,
            load_slots: MAX_LOAD_SLOTS,
            store_slots: MAX_STORE_SLOTS,
            ..EMPTY_COST
        }
        .resources()
            | RESOURCES_UNDERFLOW_MASK;

        self.reorder_buffer_tail = 0;
        self.reorder_buffer_head = 0;

        unsafe_avx2! {
            // -1 is the sentinel for "no in-flight instruction writes this register".
            self.rob_entry_by_register = i8x16::negative_one();
            self.rob_state = i8x32::zero();
            self.rob_cycles_remaining = i8x32::zero();
            self.rob_required_resources = i16x32::zero();
            self.rob_dependencies = i32x32::zero();
            self.rob_depended_by = i32x32::zero();
            self.registers_written_by_rob_entry = i16x32::zero();
        }

        // `rob_instruction` is only read when emitting events, so it only
        // needs clearing when the tracer is active.
        if T::SHOULD_CALL_ON_EVENT {
            self.rob_instruction.fill(0);
        }
    }
442
443    fn emit_event(&mut self, slot: u32, kind: EventKind) {
444        if T::SHOULD_CALL_ON_EVENT {
445            self.tracer.on_event(self.cycles, self.rob_instruction[cast(slot).to_usize()], kind);
446        }
447    }
448
    /// Advances the simulation by (at least) one cycle; thin wrapper which
    /// dispatches into the (possibly `unsafe`) AVX2 implementation.
    fn tick_cycle<const FAST_FORWARD: bool>(&mut self) {
        unsafe_avx2! {
            self.tick_cycle_avx2::<FAST_FORWARD>();
        }
    }
454
    /// Emits `event_kind` for every reorder-buffer slot whose lane in `mask`
    /// has its most significant bit set. No-op for tracers without events.
    #[cfg_attr(all(feature = "simd", target_arch = "x86_64"), target_feature(enable = "avx2"))]
    #[inline(never)]
    fn emit_events_avx2(&mut self, mask: i8x32, event_kind: EventKind) {
        if !T::SHOULD_CALL_ON_EVENT {
            return;
        }

        // Walk the set bits of the lane mask, lowest slot index first.
        let mut bits = mask.most_significant_bits();
        while bits != 0 {
            let slot = bits.trailing_zeros();
            self.emit_event(slot, event_kind);
            bits &= !(1 << slot);
        }
    }
469
    /// Returns the number of instructions currently in the reorder buffer.
    fn instructions_in_flight(&self) -> u32 {
        self.instructions_in_flight
    }
473
    /// Advances the simulated pipeline by one cycle — or, when `FAST_FORWARD`
    /// is enabled and no decode slot was consumed this cycle, by as many
    /// cycles as it takes for the next executing instruction to finish.
    ///
    /// Reorder-buffer entry states: 0 = empty, 1 = decoding, 2 = waiting for
    /// dependencies, 3 = executing, 4 = executed (waiting for retirement).
    #[cfg_attr(all(feature = "simd", target_arch = "x86_64"), target_feature(enable = "avx2"))]
    fn tick_cycle_avx2<const FAST_FORWARD: bool>(&mut self) {
        let state_decoding = i8x32::splat(1);
        let state_waiting = i8x32::splat(2);
        let state_executing = i8x32::splat(3);
        let state_executed = i8x32::splat(4);

        // Snapshots of the pre-tick state; used by the "made no progress"
        // assertion at the bottom of this function.
        #[cfg(test)]
        let original_state = self.rob_state;
        #[cfg(test)]
        let original_cycles_remaining = self.rob_cycles_remaining;
        #[cfg(test)]
        let original_dependencies = self.rob_dependencies;
        #[cfg(test)]
        let original_depended_by = self.rob_depended_by;
        #[cfg(test)]
        let original_entry_by_register = self.rob_entry_by_register;
        #[cfg(test)]
        let original_required_resources = self.rob_required_resources;
        #[cfg(test)]
        let original_decode_slots = self.decode_slots_remaining_this_cycle;
        #[cfg(test)]
        let original_reorder_buffer_head = self.reorder_buffer_head;
        #[cfg(test)]
        let original_resources_available = self.resources_available;
        #[cfg(test)]
        let original_instructions_in_flight = self.instructions_in_flight;

        #[cfg(all(test, feature = "logging"))]
        log::debug!(
            "tick_cycle_avx2[{}]: state={:?}",
            self.cycles,
            DebugState(self.rob_state.to_array())
        );
        #[cfg(all(test, feature = "logging"))]
        log::debug!(
            "tick_cycle_avx2[{}]: cycles={:?}",
            self.cycles,
            DebugCyclesRemaining(self.rob_cycles_remaining.to_array())
        );
        #[cfg(all(test, feature = "logging"))]
        log::debug!(
            "tick_cycle_avx2[{}]: dependencies={:?}",
            self.cycles,
            DebugDeps(self.rob_dependencies.to_array())
        );
        #[cfg(all(test, feature = "logging"))]
        log::debug!(
            "tick_cycle_avx2[{}]: depended_by={:?}",
            self.cycles,
            DebugDeps(self.rob_depended_by.to_array())
        );
        #[cfg(all(test, feature = "logging"))]
        log::debug!(
            "tick_cycle_avx2[{}]: entry_by_register={:?}",
            self.cycles,
            DebugEntryByRegister(self.rob_entry_by_register.to_array())
        );
        #[cfg(all(test, feature = "logging"))]
        log::debug!(
            "tick_cycle_avx2[{}]: resources_available={:?}",
            self.cycles,
            DebugResources(self.resources_available)
        );

        // Invariant: every non-empty ROB entry is counted as in-flight.
        debug_assert_eq!(
            self.rob_state.simd_eq(i8x32::zero()).most_significant_bits().count_zeros(),
            self.instructions_in_flight
        );

        // Retire unneeded instructions.
        {
            let is_waiting_for_retirement: i8x32 = self.rob_state.simd_eq(state_executed);
            // Retirement happens in program order: only the contiguous run of
            // executed entries starting at the head of the circular buffer can
            // retire this cycle (hence the rotate + trailing_ones).
            let leading_count_to_retire = is_waiting_for_retirement
                .most_significant_bits()
                .rotate_right(self.reorder_buffer_head)
                .trailing_ones() as i32;

            // Build a lane mask with a fully-set lane for each retired entry.
            let is_retired_this_cycle = i8x32::from_i1x32_sext(
                (cast(1_u64 << leading_count_to_retire).truncate_to_u32().wrapping_sub(1)).rotate_left(self.reorder_buffer_head) as i32,
            );

            // Mark every instruction which depended on instructions which just retired as not depending on them anymore.
            self.rob_dependencies = self
                .rob_dependencies
                .and_not(i32x32::splat(is_retired_this_cycle.most_significant_bits()));

            // Mark retired instructions as not depended by any other instruction.
            self.rob_depended_by = self.rob_depended_by.and_not(i32x32::from_i8x32_sext(is_retired_this_cycle));

            // Reset the state of retired instructions.
            self.rob_state = self.rob_state.and_not(is_retired_this_cycle);

            let retired_count = is_retired_this_cycle.most_significant_bits().count_ones();
            #[cfg(all(test, feature = "logging"))]
            if retired_count > 0 {
                log::debug!(
                    "tick_cycle_avx2[{}]: instructions_in_flight: {} -> {}",
                    self.cycles,
                    self.instructions_in_flight,
                    self.instructions_in_flight - retired_count
                );
            }

            self.instructions_in_flight -= retired_count;
            self.reorder_buffer_head = (self.reorder_buffer_head + retired_count) % (REORDER_BUFFER_SIZE as u32);

            self.emit_events_avx2(is_retired_this_cycle, EventKind::Retired);
            self.emit_events_avx2(
                is_waiting_for_retirement.and_not(is_retired_this_cycle),
                EventKind::WaitingForRetirement,
            );

            debug_assert_eq!(
                self.rob_state.simd_eq(i8x32::zero()).most_significant_bits().count_zeros(),
                self.instructions_in_flight
            );
        }

        // Start ready instructions. (waiting -> executing)
        {
            const RESOURCES_UNDERFLOW_MASK_I16: i16 = RESOURCES_UNDERFLOW_MASK as u16 as i16;
            // Entries with no cycles left count as finished producers, so they
            // no longer block their dependents.
            let is_executed: i8x32 = self.rob_cycles_remaining.simd_lt(i8x32::splat(1));
            let is_executed_mask: i32 = is_executed.most_significant_bits();
            let has_no_dependencies: i8x32 = (self.rob_dependencies.and_not(i32x32::splat(is_executed_mask)))
                .simd_eq(i32x32::zero())
                .clamp_to_i8_range();

            let mut is_waiting_to_start: i8x32 = self.rob_state.simd_eq(state_waiting) & has_no_dependencies;

            // At most five instructions can start executing per cycle; each
            // iteration picks the eligible entry closest to the buffer's head.
            for _ in 0..5 {
                #[cfg(all(test, feature = "logging"))]
                if is_waiting_to_start.most_significant_bits() != 0 {
                    log::debug!(
                        "tick_cycle_avx2[{}]: is_waiting_to_start={:?}",
                        self.cycles,
                        DebugMask(is_waiting_to_start.to_array())
                    );
                }
                debug_assert_eq!(self.resources_available & RESOURCES_UNDERFLOW_MASK, RESOURCES_UNDERFLOW_MASK);

                // Subtracting an entry's required resources clears at least one
                // guard bit (see RESOURCES_UNDERFLOW_MASK) iff a counter would
                // underflow; equality with the full mask means "enough resources".
                let new_resources: i16x32 = i16x32::splat(self.resources_available as i16) - self.rob_required_resources;
                let have_enough_resources: i8x32 = (new_resources.and(i16x32::splat(RESOURCES_UNDERFLOW_MASK_I16)))
                    .simd_eq(i16x32::splat(RESOURCES_UNDERFLOW_MASK_I16))
                    .clamp_to_i8_range();
                let have_enough_resources = have_enough_resources.and(is_waiting_to_start);
                let mask = have_enough_resources.most_significant_bits().rotate_right(self.reorder_buffer_head);
                let position = mask.trailing_zeros();
                if position != 32 {
                    let position = (position + self.reorder_buffer_head) % (REORDER_BUFFER_SIZE as u32);
                    #[cfg(all(test, feature = "logging"))]
                    log::debug!(
                        "tick_cycle_avx2[{}]: starting: instruction={}, slot={}",
                        self.cycles,
                        self.rob_instruction[cast(position).to_usize()],
                        position,
                    );

                    let resources_consumed = self.rob_required_resources.as_slice()[cast(position).to_usize()];
                    self.resources_available -= resources_consumed as u32;
                    // waiting (2) -> executing (3)
                    self.rob_state.as_slice_mut()[cast(position).to_usize()] += 1;
                    is_waiting_to_start.as_slice_mut()[cast(position).to_usize()] = 0;
                }
            }
            self.emit_events_avx2(self.rob_state.simd_eq(state_waiting), EventKind::WaitingForDependencies);
        }

        // Progress execution. (executing -> executing, executing -> executed)
        let mut cycle_count = 1;
        {
            let is_executing: i8x32 = self.rob_state.simd_eq(state_executing);
            if FAST_FORWARD {
                // Jump ahead to the cycle on which the next executing
                // instruction finishes, but only when nothing was decoded
                // this cycle (all decode slots still available).
                let max_cycles =
                    ((self.rob_cycles_remaining & is_executing) | (is_executing ^ i8x32::negative_one())).horizontal_min_unsigned();
                let max_cycles = cast(max_cycles).to_signed();

                #[cfg(all(test, feature = "logging"))]
                log::debug!("tick_cycle_avx2[{}]: max_cycles={}", self.cycles, max_cycles);
                if max_cycles > 0 && self.decode_slots_remaining_this_cycle == MAX_DECODE_PER_CYCLE {
                    cycle_count = max_cycles;
                }
            }

            self.rob_cycles_remaining = self.rob_cycles_remaining.saturating_sub(i8x32::splat(cycle_count) & is_executing);

            // Check which instructions just finished execution.
            let is_execution_finished: i8x32 = self.rob_cycles_remaining.simd_eq(i8x32::zero()) & is_executing;
            let is_execution_finished = is_execution_finished.to_i16x32_sext();

            #[cfg(all(test, feature = "logging"))]
            log::debug!(
                "tick_cycle_avx2[{}]: is_execution_finished={:?}",
                self.cycles,
                is_execution_finished
            );

            // Registers written by finished instructions no longer map to an
            // in-flight ROB entry (reset back to the -1 sentinel).
            let retired_register_writes: i16 = (self.registers_written_by_rob_entry & is_execution_finished).bitwise_reduce();
            self.registers_written_by_rob_entry = self.registers_written_by_rob_entry.and_not(is_execution_finished);
            self.rob_entry_by_register = self.rob_entry_by_register.or(i8x16::from_i1x16_sext(retired_register_writes));

            // Release any resources used.
            let resources_released = cast((self.rob_required_resources & is_execution_finished).wrapping_reduce()).to_unsigned();
            self.resources_available += u32::from(resources_released);
            self.rob_required_resources = self.rob_required_resources.and_not(is_execution_finished);

            let is_last_cycle = self.rob_cycles_remaining.simd_eq(i8x32::negative_one());
            let has_cycles_remaining = self.rob_cycles_remaining.simd_gt(i8x32::negative_one());
            // executing (3) -> executed (4) on an entry's last cycle.
            self.rob_state += i8x32::splat(1) & is_executing.and(is_last_cycle);
            self.emit_events_avx2(is_executing.and(is_last_cycle), EventKind::Executed);
            self.emit_events_avx2(is_executing.and(has_cycles_remaining), EventKind::Executing);
        }

        // Progress: decoding -> waiting
        {
            let is_decoding = self.rob_state.simd_eq(state_decoding);
            self.rob_state += i8x32::splat(1) & is_decoding;
        }

        // A fresh cycle starts with the full decode bandwidth.
        self.decode_slots_remaining_this_cycle = MAX_DECODE_PER_CYCLE;
        self.cycles += cast(i32::from(cycle_count)).to_unsigned();

        #[cfg(all(test, feature = "logging"))]
        {
            if self.rob_state != original_state {
                log::debug!("tick_cycle_avx2[{}]: state changed!", self.cycles);
            } else {
                log::debug!("tick_cycle_avx2[{}]: state did NOT change!", self.cycles);
            }
        }

        // Each tick must make observable progress, otherwise the simulation
        // could loop forever.
        #[cfg(test)]
        {
            assert!(
                self.instructions_in_flight != original_instructions_in_flight
                    || self.reorder_buffer_head != original_reorder_buffer_head
                    || self.decode_slots_remaining_this_cycle != original_decode_slots
                    || self.resources_available != original_resources_available
                    || self.rob_state != original_state
                    || self.rob_cycles_remaining.max_signed(i8x32::negative_one())
                        != original_cycles_remaining.max_signed(i8x32::negative_one())
                    || self.rob_dependencies != original_dependencies
                    || self.rob_depended_by != original_depended_by
                    || self.rob_entry_by_register != original_entry_by_register
                    || self.rob_required_resources != original_required_resources,
                "made no progress"
            );
        }
    }
721
    /// Ticks the simulated clock until `decode_slots` more instruction(s) can
    /// be decoded: at most one tick for exhausted decode slots (a tick refills
    /// them), plus however many ticks it takes for a reorder-buffer slot to
    /// free up when the buffer is full.
    #[inline(always)]
    fn tick_cycle_if_cannot_decode(&mut self, decode_slots: u32) {
        let mut should_tick =
            self.decode_slots_remaining_this_cycle < decode_slots || self.instructions_in_flight() == (REORDER_BUFFER_SIZE as u32);
        while should_tick {
            self.tick_cycle::<false>();
            // A tick resets the decode slots to the maximum, so only the
            // "reorder buffer is full" condition needs re-checking here.
            should_tick = self.instructions_in_flight() == (REORDER_BUFFER_SIZE as u32);
        }
    }
731
732    #[inline(always)]
733    fn wait_until_empty(&mut self) {
734        #[cfg(all(test, feature = "logging"))]
735        if self.instructions_in_flight() > 0 {
736            log::debug!("wait_until_empty[{}]: starting fast forward!", self.cycles);
737        }
738
739        while self.instructions_in_flight() > 0 {
740            if self.tracer.should_enable_fast_forward() {
741                self.tick_cycle::<true>();
742            } else {
743                self.tick_cycle::<false>();
744            }
745        }
746    }
747
    /// Feeds one instruction into the pipeline: `dst` is the register it
    /// writes (if any), `src1`/`src2` are the registers it reads, and `cost`
    /// describes its decode, latency and execution-resource requirements.
    fn dispatch_generic(&mut self, dst: Option<RawReg>, src1: Option<RawReg>, src2: Option<RawReg>, cost: InstCost) {
        #[cfg(all(test, feature = "logging"))]
        log::debug!(
            "dispatch[{}]: instruction={:?}, dst={:?}, src=[{:?}, {:?}], slots={}, latency={}, alu={}, load={}, store={}, mul={}, div={}",
            self.cycles,
            self.instructions,
            dst.map(|reg| reg.get()),
            src1.map(|reg| reg.get()),
            src2.map(|reg| reg.get()),
            cost.decode_slots,
            cost.latency,
            cost.alu_slots,
            cost.load_slots,
            cost.store_slots,
            cost.mul_slots,
            cost.div_slots,
        );

        // Latency must be non-negative: it seeds `rob_cycles_remaining`.
        debug_assert!(cost.latency >= 0);
        unsafe_avx2! { self.dispatch_generic_avx2(dst, src1, src2, cost) }
    }
769
    /// Core of [`Self::dispatch_generic`].
    ///
    /// Allocates a reorder-buffer (ROB) slot for the instruction, records its
    /// latency and resource requirements, links it to the ROB entries producing
    /// its source registers, and publishes it as the newest writer of `dst`.
    #[cfg_attr(all(feature = "simd", target_arch = "x86_64"), target_feature(enable = "avx2"))]
    fn dispatch_generic_avx2(&mut self, dst: Option<RawReg>, src1: Option<RawReg>, src2: Option<RawReg>, cost: InstCost) {
        let dst = dst.map(|dst| dst.get());
        let src1 = src1.map(|src1| src1.get());
        let src2 = src2.map(|src2| src2.get());

        // Stall (advance the cycle counter) until enough decode slots are free.
        self.tick_cycle_if_cannot_decode(cost.decode_slots);
        if T::SHOULD_CALL_ON_EVENT {
            self.tracer.on_event(self.cycles, self.instructions, EventKind::Decode);
        }

        // Claim the next slot in the circular reorder buffer.
        let slot = self.reorder_buffer_tail;
        self.reorder_buffer_tail = (self.reorder_buffer_tail + 1) % (REORDER_BUFFER_SIZE as u32);
        // Per-lane mask: 0xff in this slot's lane, zero elsewhere.
        let slot_mask = i8x32::zero().set_dynamic(cast(slot).truncate_to_u8(), cast(0xff_u8).to_signed());

        self.rob_cycles_remaining = self.rob_cycles_remaining.set_dynamic(slot as u8, cost.latency);
        self.rob_required_resources.as_slice_mut()[slot as usize] = cost.resources() as u16 as i16;

        // For each source register, look up which ROB entry produces it.
        // The entry table stores -1 (0xffffffff after the widening cast) when
        // the register has no in-flight producer.
        let dependency_1: Option<u32> = src1
            .map(|src1| self.rob_entry_by_register.as_slice()[src1.to_usize()])
            .map(i32::from)
            .map(|x| cast(x).to_unsigned());
        let dependency_2: Option<u32> = src2
            .map(|src2| self.rob_entry_by_register.as_slice()[src2.to_usize()])
            .map(i32::from)
            .map(|x| cast(x).to_unsigned());
        match (dependency_1, dependency_2) {
            (Some(dependency_1), Some(dependency_2)) => {
                // `base` is 0 for the -1 sentinel and 1 for a real dependency;
                // multiplying the index and masking the OR-ed bit by it turns
                // the whole update into a branchless no-op (index 0, OR with 0)
                // for absent dependencies. `wrapping_shl` is required because
                // the sentinel value would otherwise shift by >= 32 bits.
                let base_1 = (dependency_1 >> 31) ^ 1;
                let base_2 = (dependency_2 >> 31) ^ 1;
                let dependencies_mask = cast(base_1.wrapping_shl(dependency_1) | base_2.wrapping_shl(dependency_2)).to_signed();
                self.rob_dependencies.as_slice_mut()[slot as usize] = dependencies_mask;
                self.rob_depended_by.as_slice_mut()[(dependency_1 * base_1) as usize] |= cast(base_1 << slot).to_signed();
                self.rob_depended_by.as_slice_mut()[(dependency_2 * base_2) as usize] |= cast(base_2 << slot).to_signed();
            }
            (Some(dependency), None) | (None, Some(dependency)) => {
                // Same branchless-no-op trick as above, for a single dependency.
                let base = (dependency >> 31) ^ 1;
                self.rob_dependencies.as_slice_mut()[slot as usize] = cast(base.wrapping_shl(dependency)).to_signed();
                self.rob_depended_by.as_slice_mut()[(dependency * base) as usize] |= cast(base.wrapping_shl(slot)).to_signed();
            }
            (None, None) => {}
        }

        if let Some(dst) = dst {
            // Make this slot the sole in-flight writer of `dst`: clear the
            // register's bit in every other ROB entry, then set it for this one.
            let dst_mask: i16x32 = i16x32::splat(cast(cast(1_u32 << dst.to_u32()).truncate_to_u16()).to_signed());
            self.registers_written_by_rob_entry =
                self.registers_written_by_rob_entry.and_not(dst_mask) | (slot_mask.to_i16x32_sext() & dst_mask);
            self.rob_entry_by_register.as_slice_mut()[dst.to_usize()] = cast(cast(slot).truncate_to_u8()).to_signed();
        }

        // Mark the slot occupied.
        self.rob_state = self.rob_state.set_dynamic(slot as u8, 1);
        if T::SHOULD_CALL_ON_EVENT {
            self.rob_instruction[cast(slot).to_usize()] = self.instructions;
        }

        self.instructions_in_flight += 1;
        self.decode_slots_remaining_this_cycle -= cost.decode_slots;
        self.instructions += 1;

        // Sanity check: occupied ROB lanes must match the in-flight counter.
        debug_assert_eq!(
            self.rob_state.simd_eq(i8x32::zero()).most_significant_bits().count_zeros(),
            self.instructions_in_flight
        );
    }
834
    /// Dispatches a register-to-register move without allocating a ROB slot.
    ///
    /// The move is handled purely by renaming: `dst` starts aliasing whatever
    /// ROB entry (if any) produces `src`, so only a single decode slot is
    /// consumed — no latency and no execution resources.
    fn dispatch_move_reg_avx2(&mut self, dst: RawReg, src: RawReg) {
        let dst = dst.get();
        let src = src.get();

        self.tick_cycle_if_cannot_decode(1);
        if T::SHOULD_CALL_ON_EVENT {
            self.tracer.on_event(self.cycles, self.instructions, EventKind::Decode);
        }

        let entry_by_register = self.rob_entry_by_register.as_slice_mut();
        let registers_written_by_rob_entry = self.registers_written_by_rob_entry.as_slice_mut();
        // Detach `dst` from its previous producer, if it had one (-1 = none).
        let old_slot = entry_by_register[dst.to_usize()];
        if old_slot != -1 {
            registers_written_by_rob_entry[old_slot as usize] &= !(1_i16 << dst.to_usize());
        }

        // Attach `dst` to `src`'s producer, if `src` is still in flight.
        let new_slot = entry_by_register[src.to_usize()];
        if new_slot != -1 {
            registers_written_by_rob_entry[new_slot as usize] |= 1 << dst.to_usize();
        }

        entry_by_register[dst.to_usize()] = new_slot;
        self.decode_slots_remaining_this_cycle -= 1;
        self.instructions += 1;
    }
860
    /// Dispatches a three-operand instruction (one destination, two sources).
    fn dispatch_3op(&mut self, dst: RawReg, src1: RawReg, src2: RawReg, cost: InstCost) {
        self.dispatch_generic(Some(dst), Some(src1), Some(src2), cost);
    }

    /// Dispatches a two-operand instruction (one destination, one source).
    fn dispatch_2op(&mut self, dst: RawReg, src: RawReg, cost: InstCost) {
        self.dispatch_generic(Some(dst), Some(src), None, cost);
    }

    /// Dispatches an instruction which only writes a destination register.
    fn dispatch_1op_dst(&mut self, dst: RawReg, cost: InstCost) {
        self.dispatch_generic(Some(dst), None, None, cost);
    }
872
873    fn dispatch_finish(&mut self, latency: i8) {
874        self.dispatch_generic(
875            None,
876            None,
877            None,
878            InstCost {
879                latency,
880                decode_slots: 1,
881                ..EMPTY_COST
882            },
883        );
884
885        self.wait_until_empty();
886        self.finished = true;
887    }
888
889    fn load_cost(&self) -> InstCost {
890        const L1_HIT: i8 = 4;
891        const L2_HIT: i8 = 25;
892        const L3_HIT: i8 = 37;
893
894        let latency = match self.cache_model {
895            CacheModel::L1Hit => L1_HIT,
896            CacheModel::L2Hit => L2_HIT,
897            CacheModel::L3Hit => L3_HIT,
898        };
899
900        InstCost {
901            latency,
902            decode_slots: 1,
903            alu_slots: 1,
904            load_slots: 1,
905            ..EMPTY_COST
906        }
907    }
908
    /// Dispatches a register-indirect load; the base register is a source.
    /// The offset and access width are currently ignored by the cost model.
    fn dispatch_indirect_load(&mut self, dst: RawReg, base: RawReg, _offset: u32, _size: u32) {
        self.dispatch_2op(dst, base, self.load_cost());
    }

    /// Dispatches an absolute-address load; only the destination register is involved.
    fn dispatch_load(&mut self, dst: RawReg, _offset: u32, _size: u32) {
        self.dispatch_1op_dst(dst, self.load_cost());
    }
916
917    #[allow(clippy::unused_self)]
918    fn store_cost(&self) -> InstCost {
919        InstCost {
920            latency: 25,
921            decode_slots: 1,
922            alu_slots: 1,
923            store_slots: 1,
924            ..EMPTY_COST
925        }
926    }
927
    /// Dispatches a store from a register to an absolute address.
    fn dispatch_store(&mut self, src: RawReg, _offset: u32, _size: u32) {
        self.dispatch_generic(None, Some(src), None, self.store_cost());
    }

    /// Dispatches a store of an immediate to an absolute address.
    fn dispatch_store_imm(&mut self, _offset: u32, _size: u32) {
        self.dispatch_generic(None, None, None, self.store_cost());
    }

    /// Dispatches a store from a register through a base register.
    fn dispatch_store_indirect(&mut self, src: RawReg, base: RawReg, _offset: u32, _size: u32) {
        self.dispatch_generic(None, Some(src), Some(base), self.store_cost());
    }

    /// Dispatches a store of an immediate through a base register.
    fn dispatch_store_imm_indirect(&mut self, base: RawReg, _offset: u32, _size: u32) {
        self.dispatch_generic(None, Some(base), None, self.store_cost());
    }
943
944    fn get_branch_cost(&self, offset: u32, args_length: u32, jump_offset: u32) -> i8 {
945        const BRANCH_PREDICTION_HIT_COST: i8 = 1;
946        const BRANCH_PREDICTION_MISS_COST: i8 = 20;
947
948        if let Some(is_hit) = self.force_branch_is_cheap {
949            return if is_hit {
950                BRANCH_PREDICTION_HIT_COST
951            } else {
952                BRANCH_PREDICTION_MISS_COST
953            };
954        }
955
956        if self
957            .code
958            .get(cast(offset).to_usize() + cast(args_length).to_usize())
959            .map(|&opcode| opcode == self.opcode_unlikely || opcode == self.opcode_trap)
960            .unwrap_or(true)
961        {
962            return BRANCH_PREDICTION_HIT_COST;
963        }
964
965        if self
966            .code
967            .get(cast(jump_offset).to_usize())
968            .map(|&opcode| opcode == self.opcode_unlikely || opcode == self.opcode_trap)
969            .unwrap_or(true)
970        {
971            return BRANCH_PREDICTION_HIT_COST;
972        }
973
974        BRANCH_PREDICTION_MISS_COST
975    }
976
977    fn dispatch_branch(&mut self, offset: u32, args_length: u32, s1: RawReg, s2: RawReg, jump_offset: u32) {
978        self.dispatch_generic(
979            None,
980            Some(s1),
981            Some(s2),
982            InstCost {
983                latency: self.get_branch_cost(offset, args_length, jump_offset),
984                decode_slots: 1,
985                alu_slots: 1,
986                ..EMPTY_COST
987            },
988        );
989        self.wait_until_empty();
990        self.finished = true;
991    }
992
993    fn dispatch_branch_imm(&mut self, offset: u32, args_length: u32, s: RawReg, jump_offset: u32) {
994        self.dispatch_generic(
995            None,
996            Some(s),
997            None,
998            InstCost {
999                latency: self.get_branch_cost(offset, args_length, jump_offset),
1000                decode_slots: 1,
1001                alu_slots: 1,
1002                ..EMPTY_COST
1003            },
1004        );
1005        self.wait_until_empty();
1006        self.finished = true;
1007    }
1008
    /// Dispatches a trivial 2-operand instruction: 1-cycle latency, one ALU slot.
    fn dispatch_trivial_2op_1c(&mut self, d: RawReg, s: RawReg) {
        self.dispatch_2op(
            d,
            s,
            InstCost {
                latency: 1,
                decode_slots: 1,
                alu_slots: 1,
                ..EMPTY_COST
            },
        );
    }

    /// Dispatches a trivial 2-operand instruction: 2-cycle latency, two ALU slots.
    fn dispatch_trivial_2op_2c(&mut self, d: RawReg, s: RawReg) {
        self.dispatch_2op(
            d,
            s,
            InstCost {
                latency: 2,
                decode_slots: 1,
                alu_slots: 2,
                ..EMPTY_COST
            },
        );
    }
1034
    /// Dispatches a simple 2-operand ALU instruction.
    ///
    /// An extra decode slot is charged when `d != s` — presumably to account
    /// for the extra register move the recompiler emits in that case (see the
    /// `lea` TODO on `add_imm_64`); confirm against the recompiler.
    fn dispatch_simple_alu_2op(&mut self, d: RawReg, s: RawReg) {
        self.dispatch_2op(
            d,
            s,
            InstCost {
                latency: 1,
                decode_slots: 1 + u32::from(d.get() != s.get()),
                alu_slots: 1,
                ..EMPTY_COST
            },
        );
    }

    /// 32-bit variant of `dispatch_simple_alu_2op`: on a 64-bit guest it
    /// charges one extra cycle and one extra decode slot.
    fn dispatch_simple_alu_2op_32bit(&mut self, d: RawReg, s: RawReg) {
        self.dispatch_2op(
            d,
            s,
            InstCost {
                latency: 1 + i8::from(B::BITNESS == Bitness::B64),
                decode_slots: 1 + u32::from(d.get() != s.get()) + u32::from(B::BITNESS == Bitness::B64),
                alu_slots: 1,
                ..EMPTY_COST
            },
        );
    }

    /// Dispatches a simple 3-operand ALU instruction; an extra decode slot is
    /// charged when the destination aliases neither source.
    fn dispatch_simple_alu_3op(&mut self, d: RawReg, s1: RawReg, s2: RawReg) {
        self.dispatch_3op(
            d,
            s1,
            s2,
            InstCost {
                latency: 1,
                decode_slots: 1 + u32::from((d.get() != s1.get()) & (d.get() != s2.get())),
                alu_slots: 1,
                ..EMPTY_COST
            },
        );
    }

    /// 32-bit variant of `dispatch_simple_alu_3op`: on a 64-bit guest it
    /// charges one extra cycle and one extra decode slot.
    fn dispatch_simple_alu_3op_32(&mut self, d: RawReg, s1: RawReg, s2: RawReg) {
        self.dispatch_3op(
            d,
            s1,
            s2,
            InstCost {
                latency: 1 + i8::from(B::BITNESS == Bitness::B64),
                decode_slots: 1 + u32::from((d.get() != s1.get()) & (d.get() != s2.get())) + u32::from(B::BITNESS == Bitness::B64),
                alu_slots: 1,
                ..EMPTY_COST
            },
        );
    }
1088
    /// Dispatches a 64-bit shift/rotate by a register amount; an extra decode
    /// slot is charged when the destination doesn't alias the shifted value.
    fn dispatch_shift(&mut self, d: RawReg, s1: RawReg, s2: RawReg) {
        self.dispatch_3op(
            d,
            s1,
            s2,
            InstCost {
                latency: 1,
                decode_slots: 2 + u32::from(d.get() != s1.get()),
                alu_slots: 1,
                ..EMPTY_COST
            },
        )
    }

    /// 32-bit variant of `dispatch_shift`: on a 64-bit guest it charges one
    /// extra cycle and one extra decode slot.
    fn dispatch_shift_32(&mut self, d: RawReg, s1: RawReg, s2: RawReg) {
        self.dispatch_3op(
            d,
            s1,
            s2,
            InstCost {
                latency: 1 + i8::from(B::BITNESS == Bitness::B64),
                decode_slots: 2 + u32::from(d.get() != s1.get()) + u32::from(B::BITNESS == Bitness::B64),
                alu_slots: 1,
                ..EMPTY_COST
            },
        )
    }

    /// Dispatches the "alt" immediate form of a 64-bit shift/rotate
    /// (the `s1` operand is an immediate).
    fn dispatch_shift_imm_alt(&mut self, d: RawReg, s: RawReg) {
        self.dispatch_2op(
            d,
            s,
            InstCost {
                latency: 1,
                decode_slots: 3,
                alu_slots: 1,
                ..EMPTY_COST
            },
        )
    }

    /// 32-bit variant of `dispatch_shift_imm_alt`.
    fn dispatch_shift_imm_alt_32(&mut self, d: RawReg, s: RawReg) {
        self.dispatch_2op(
            d,
            s,
            InstCost {
                latency: 2,
                decode_slots: 4,
                alu_slots: 1,
                ..EMPTY_COST
            },
        )
    }
1142
    /// Dispatches a set-less-than style register-register comparison.
    fn dispatch_compare(&mut self, d: RawReg, s1: RawReg, s2: RawReg) {
        self.dispatch_3op(
            d,
            s1,
            s2,
            InstCost {
                latency: 3,
                decode_slots: 3,
                alu_slots: 1,
                ..EMPTY_COST
            },
        )
    }

    /// Immediate variant of `dispatch_compare`.
    fn dispatch_compare_imm(&mut self, d: RawReg, s: RawReg) {
        self.dispatch_2op(
            d,
            s,
            InstCost {
                latency: 3,
                decode_slots: 3,
                alu_slots: 1,
                ..EMPTY_COST
            },
        )
    }

    /// Dispatches a conditional move: value source `s`, condition register `c`.
    fn dispatch_cmov(&mut self, d: RawReg, s: RawReg, c: RawReg) {
        self.dispatch_3op(
            d,
            s,
            c,
            InstCost {
                latency: 2,
                decode_slots: 2,
                alu_slots: 1,
                ..EMPTY_COST
            },
        )
    }

    /// Immediate-value variant of `dispatch_cmov`; only the condition register
    /// is a source.
    fn dispatch_cmov_imm(&mut self, d: RawReg, c: RawReg) {
        self.dispatch_2op(
            d,
            c,
            InstCost {
                latency: 2,
                decode_slots: 3,
                alu_slots: 1,
                ..EMPTY_COST
            },
        )
    }
1196
    /// Dispatches a minimum/maximum instruction; an extra decode slot is
    /// charged when the destination aliases neither source.
    fn dispatch_min_max(&mut self, d: RawReg, s1: RawReg, s2: RawReg) {
        self.dispatch_3op(
            d,
            s1,
            s2,
            InstCost {
                latency: 3,
                decode_slots: 2 + u32::from((d.get() != s1.get()) & (d.get() != s2.get())),
                alu_slots: 1,
                ..EMPTY_COST
            },
        )
    }

    /// Dispatches a division/remainder: long fixed latency, and it occupies the
    /// (single — see `MAX_DIV_SLOTS`) divider, so divisions serialize.
    fn dispatch_division(&mut self, d: RawReg, s1: RawReg, s2: RawReg) {
        self.dispatch_3op(
            d,
            s1,
            s2,
            InstCost {
                latency: 60,
                decode_slots: 4,
                alu_slots: 1,
                div_slots: 1,
                ..EMPTY_COST
            },
        )
    }
1225}
1226
1227impl<'a, B, T> GasVisitorT for Simulator<'a, B, T>
1228where
1229    B: BitnessT,
1230    T: Tracer,
1231{
1232    #[inline]
1233    fn take_block_cost(&mut self) -> Option<u32> {
1234        if (self.instructions_in_flight() == 0) & self.finished {
1235            let cycles = self.cycles;
1236            self.clear();
1237
1238            let cycles = cast((cast(cycles).to_signed() - GAS_COST_SLACK).max(1)).to_unsigned();
1239            Some(cycles)
1240        } else {
1241            None
1242        }
1243    }
1244
1245    fn is_at_start_of_basic_block(&self) -> bool {
1246        self.instructions == 0
1247    }
1248}
1249
1250impl<'a, B, T> ParsingVisitor for Simulator<'a, B, T>
1251where
1252    B: BitnessT,
1253    T: Tracer,
1254{
    type ReturnTy = ();

    // Simple ALU instructions (3 op).
    //
    // All of these delegate to `dispatch_simple_alu_3op`.

    #[inline(always)]
    fn xor(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, s2: RawReg) -> Self::ReturnTy {
        self.dispatch_simple_alu_3op(d, s1, s2)
    }

    #[inline(always)]
    fn and(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, s2: RawReg) -> Self::ReturnTy {
        self.dispatch_simple_alu_3op(d, s1, s2)
    }

    #[inline(always)]
    fn or(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, s2: RawReg) -> Self::ReturnTy {
        self.dispatch_simple_alu_3op(d, s1, s2)
    }

    #[inline(always)]
    fn add_64(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, s2: RawReg) -> Self::ReturnTy {
        self.dispatch_simple_alu_3op(d, s1, s2)
    }

    #[inline(always)]
    fn sub_64(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, s2: RawReg) -> Self::ReturnTy {
        self.dispatch_simple_alu_3op(d, s1, s2)
    }

    // Simple ALU instructions (3 op), 32-bit.
    //
    // These cost extra on 64-bit guests; see `dispatch_simple_alu_3op_32`.

    #[inline(always)]
    fn add_32(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, s2: RawReg) -> Self::ReturnTy {
        self.dispatch_simple_alu_3op_32(d, s1, s2)
    }

    #[inline(always)]
    fn sub_32(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, s2: RawReg) -> Self::ReturnTy {
        self.dispatch_simple_alu_3op_32(d, s1, s2)
    }

    // Simple ALU instructions (2 op).

    #[inline(always)]
    fn xor_imm(&mut self, _offset: u32, _args_length: u32, d: RawReg, s: RawReg, _imm: u32) -> Self::ReturnTy {
        self.dispatch_simple_alu_2op(d, s)
    }

    #[inline(always)]
    fn and_imm(&mut self, _offset: u32, _args_length: u32, d: RawReg, s: RawReg, _imm: u32) -> Self::ReturnTy {
        self.dispatch_simple_alu_2op(d, s)
    }

    #[inline(always)]
    fn or_imm(&mut self, _offset: u32, _args_length: u32, d: RawReg, s: RawReg, _imm: u32) -> Self::ReturnTy {
        self.dispatch_simple_alu_2op(d, s)
    }

    #[inline(always)]
    fn add_imm_64(&mut self, _offset: u32, _args_length: u32, d: RawReg, s: RawReg, _imm: u32) -> Self::ReturnTy {
        // TODO: in 'd != s' case we use a single `lea`, see if modeling that makes sense
        self.dispatch_simple_alu_2op(d, s)
    }

    #[inline(always)]
    fn shift_logical_right_imm_64(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, _s2: u32) -> Self::ReturnTy {
        self.dispatch_simple_alu_2op(d, s1)
    }

    #[inline(always)]
    fn shift_arithmetic_right_imm_64(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, _s2: u32) -> Self::ReturnTy {
        self.dispatch_simple_alu_2op(d, s1)
    }

    #[inline(always)]
    fn shift_logical_left_imm_64(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, _s2: u32) -> Self::ReturnTy {
        self.dispatch_simple_alu_2op(d, s1)
    }

    #[inline(always)]
    fn rotate_right_imm_64(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, _c: u32) -> Self::ReturnTy {
        self.dispatch_simple_alu_2op(d, s1)
    }

    #[inline(always)]
    fn reverse_byte(&mut self, _offset: u32, _args_length: u32, d: RawReg, s: RawReg) -> Self::ReturnTy {
        self.dispatch_simple_alu_2op(d, s)
    }
1343
    // Simple ALU instructions (2 op), 32-bit.
    //
    // Routed through `dispatch_simple_alu_2op_32bit`, which charges extra on
    // 64-bit guests.

    #[inline(always)]
    fn add_imm_32(&mut self, _offset: u32, _args_length: u32, d: RawReg, s: RawReg, _imm: u32) -> Self::ReturnTy {
        // TODO: in 'd != s' case we use a single `lea`, see if modeling that makes sense
        self.dispatch_simple_alu_2op_32bit(d, s)
    }

    #[inline(always)]
    fn shift_logical_right_imm_32(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, _s2: u32) -> Self::ReturnTy {
        self.dispatch_simple_alu_2op_32bit(d, s1)
    }

    #[inline(always)]
    fn shift_arithmetic_right_imm_32(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, _s2: u32) -> Self::ReturnTy {
        self.dispatch_simple_alu_2op_32bit(d, s1)
    }

    #[inline(always)]
    fn shift_logical_left_imm_32(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, _s2: u32) -> Self::ReturnTy {
        self.dispatch_simple_alu_2op_32bit(d, s1)
    }

    #[inline(always)]
    fn rotate_right_imm_32(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, _c: u32) -> Self::ReturnTy {
        self.dispatch_simple_alu_2op_32bit(d, s1)
    }

    // Trivial (2 op, 1 cycle).

    #[inline(always)]
    fn count_leading_zero_bits_32(&mut self, _offset: u32, _args_length: u32, d: RawReg, s: RawReg) -> Self::ReturnTy {
        self.dispatch_trivial_2op_1c(d, s)
    }

    #[inline(always)]
    fn count_leading_zero_bits_64(&mut self, _offset: u32, _args_length: u32, d: RawReg, s: RawReg) -> Self::ReturnTy {
        self.dispatch_trivial_2op_1c(d, s)
    }

    #[inline(always)]
    fn count_set_bits_32(&mut self, _offset: u32, _args_length: u32, d: RawReg, s: RawReg) -> Self::ReturnTy {
        self.dispatch_trivial_2op_1c(d, s)
    }

    #[inline(always)]
    fn count_set_bits_64(&mut self, _offset: u32, _args_length: u32, d: RawReg, s: RawReg) -> Self::ReturnTy {
        self.dispatch_trivial_2op_1c(d, s)
    }

    #[inline(always)]
    fn sign_extend_8(&mut self, _offset: u32, _args_length: u32, d: RawReg, s: RawReg) -> Self::ReturnTy {
        self.dispatch_trivial_2op_1c(d, s)
    }

    #[inline(always)]
    fn sign_extend_16(&mut self, _offset: u32, _args_length: u32, d: RawReg, s: RawReg) -> Self::ReturnTy {
        self.dispatch_trivial_2op_1c(d, s)
    }

    #[inline(always)]
    fn zero_extend_16(&mut self, _offset: u32, _args_length: u32, d: RawReg, s: RawReg) -> Self::ReturnTy {
        self.dispatch_trivial_2op_1c(d, s)
    }

    // Trivial (2 op, 2 cycles).

    #[inline(always)]
    fn count_trailing_zero_bits_32(&mut self, _offset: u32, _args_length: u32, d: RawReg, s: RawReg) -> Self::ReturnTy {
        self.dispatch_trivial_2op_2c(d, s)
    }

    #[inline(always)]
    fn count_trailing_zero_bits_64(&mut self, _offset: u32, _args_length: u32, d: RawReg, s: RawReg) -> Self::ReturnTy {
        self.dispatch_trivial_2op_2c(d, s)
    }
1420
    // Shifts and rotates, 64-bit (shift amount from a register).

    #[inline(always)]
    fn shift_logical_right_64(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, s2: RawReg) -> Self::ReturnTy {
        self.dispatch_shift(d, s1, s2)
    }

    #[inline(always)]
    fn shift_arithmetic_right_64(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, s2: RawReg) -> Self::ReturnTy {
        self.dispatch_shift(d, s1, s2)
    }

    #[inline(always)]
    fn shift_logical_left_64(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, s2: RawReg) -> Self::ReturnTy {
        self.dispatch_shift(d, s1, s2)
    }

    #[inline(always)]
    fn rotate_left_64(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, s2: RawReg) -> Self::ReturnTy {
        self.dispatch_shift(d, s1, s2)
    }

    #[inline(always)]
    fn rotate_right_64(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, s2: RawReg) -> Self::ReturnTy {
        self.dispatch_shift(d, s1, s2)
    }

    // Shifts and rotates, 32-bit.

    #[inline(always)]
    fn shift_logical_right_32(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, s2: RawReg) -> Self::ReturnTy {
        self.dispatch_shift_32(d, s1, s2)
    }

    #[inline(always)]
    fn shift_arithmetic_right_32(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, s2: RawReg) -> Self::ReturnTy {
        self.dispatch_shift_32(d, s1, s2)
    }

    #[inline(always)]
    fn shift_logical_left_32(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, s2: RawReg) -> Self::ReturnTy {
        self.dispatch_shift_32(d, s1, s2)
    }

    #[inline(always)]
    fn rotate_left_32(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, s2: RawReg) -> Self::ReturnTy {
        self.dispatch_shift_32(d, s1, s2)
    }

    #[inline(always)]
    fn rotate_right_32(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, s2: RawReg) -> Self::ReturnTy {
        self.dispatch_shift_32(d, s1, s2)
    }

    // Shifts and rotates, alt (the `s1` operand is an immediate).

    #[inline(always)]
    fn shift_logical_right_imm_alt_64(&mut self, _offset: u32, _args_length: u32, d: RawReg, s2: RawReg, _s1: u32) -> Self::ReturnTy {
        self.dispatch_shift_imm_alt(d, s2)
    }

    #[inline(always)]
    fn shift_arithmetic_right_imm_alt_64(&mut self, _offset: u32, _args_length: u32, d: RawReg, s2: RawReg, _s1: u32) -> Self::ReturnTy {
        self.dispatch_shift_imm_alt(d, s2)
    }

    #[inline(always)]
    fn shift_logical_left_imm_alt_64(&mut self, _offset: u32, _args_length: u32, d: RawReg, s2: RawReg, _s1: u32) -> Self::ReturnTy {
        self.dispatch_shift_imm_alt(d, s2)
    }

    #[inline(always)]
    fn rotate_right_imm_alt_64(&mut self, _offset: u32, _args_length: u32, d: RawReg, s: RawReg, _c: u32) -> Self::ReturnTy {
        self.dispatch_shift_imm_alt(d, s)
    }

    // Shifts and rotates, alt (32-bit).

    #[inline(always)]
    fn shift_logical_right_imm_alt_32(&mut self, _offset: u32, _args_length: u32, d: RawReg, s2: RawReg, _s1: u32) -> Self::ReturnTy {
        self.dispatch_shift_imm_alt_32(d, s2)
    }

    #[inline(always)]
    fn shift_arithmetic_right_imm_alt_32(&mut self, _offset: u32, _args_length: u32, d: RawReg, s2: RawReg, _s1: u32) -> Self::ReturnTy {
        self.dispatch_shift_imm_alt_32(d, s2)
    }

    #[inline(always)]
    fn shift_logical_left_imm_alt_32(&mut self, _offset: u32, _args_length: u32, d: RawReg, s2: RawReg, _s1: u32) -> Self::ReturnTy {
        self.dispatch_shift_imm_alt_32(d, s2)
    }

    #[inline(always)]
    fn rotate_right_imm_alt_32(&mut self, _offset: u32, _args_length: u32, d: RawReg, s: RawReg, _c: u32) -> Self::ReturnTy {
        self.dispatch_shift_imm_alt_32(d, s)
    }
1518
    // Register comparisons.

    #[inline(always)]
    fn set_less_than_unsigned(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, s2: RawReg) -> Self::ReturnTy {
        self.dispatch_compare(d, s1, s2)
    }

    #[inline(always)]
    fn set_less_than_signed(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, s2: RawReg) -> Self::ReturnTy {
        self.dispatch_compare(d, s1, s2)
    }

    // Register comparisons (immediate).

    #[inline(always)]
    fn set_less_than_unsigned_imm(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, _s2: u32) -> Self::ReturnTy {
        self.dispatch_compare_imm(d, s1)
    }

    #[inline(always)]
    fn set_less_than_signed_imm(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, _s2: u32) -> Self::ReturnTy {
        self.dispatch_compare_imm(d, s1)
    }

    #[inline(always)]
    fn set_greater_than_unsigned_imm(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, _s2: u32) -> Self::ReturnTy {
        self.dispatch_compare_imm(d, s1)
    }

    #[inline(always)]
    fn set_greater_than_signed_imm(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, _s2: u32) -> Self::ReturnTy {
        self.dispatch_compare_imm(d, s1)
    }

    // Conditional moves.

    #[inline(always)]
    fn cmov_if_zero(&mut self, _offset: u32, _args_length: u32, d: RawReg, s: RawReg, c: RawReg) -> Self::ReturnTy {
        self.dispatch_cmov(d, s, c)
    }

    #[inline(always)]
    fn cmov_if_not_zero(&mut self, _offset: u32, _args_length: u32, d: RawReg, s: RawReg, c: RawReg) -> Self::ReturnTy {
        self.dispatch_cmov(d, s, c)
    }

    #[inline(always)]
    fn cmov_if_zero_imm(&mut self, _offset: u32, _args_length: u32, d: RawReg, c: RawReg, _s: u32) -> Self::ReturnTy {
        self.dispatch_cmov_imm(d, c)
    }

    #[inline(always)]
    fn cmov_if_not_zero_imm(&mut self, _offset: u32, _args_length: u32, d: RawReg, c: RawReg, _s: u32) -> Self::ReturnTy {
        self.dispatch_cmov_imm(d, c)
    }

    // Minimum/maximum.

    #[inline(always)]
    fn maximum(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, s2: RawReg) -> Self::ReturnTy {
        self.dispatch_min_max(d, s1, s2)
    }

    #[inline(always)]
    fn maximum_unsigned(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, s2: RawReg) -> Self::ReturnTy {
        self.dispatch_min_max(d, s1, s2)
    }

    #[inline(always)]
    fn minimum(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, s2: RawReg) -> Self::ReturnTy {
        self.dispatch_min_max(d, s1, s2)
    }

    #[inline(always)]
    fn minimum_unsigned(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, s2: RawReg) -> Self::ReturnTy {
        self.dispatch_min_max(d, s1, s2)
    }
1596
1597    // Indirect loads
1598
1599    #[inline(always)]
1600    fn load_indirect_u8(&mut self, _offset: u32, _args_length: u32, dst: RawReg, base: RawReg, offset: u32) -> Self::ReturnTy {
1601        self.dispatch_indirect_load(dst, base, offset, 1)
1602    }
1603
1604    #[inline(always)]
1605    fn load_indirect_i8(&mut self, _offset: u32, _args_length: u32, dst: RawReg, base: RawReg, offset: u32) -> Self::ReturnTy {
1606        self.dispatch_indirect_load(dst, base, offset, 1)
1607    }
1608
1609    #[inline(always)]
1610    fn load_indirect_u16(&mut self, _offset: u32, _args_length: u32, dst: RawReg, base: RawReg, offset: u32) -> Self::ReturnTy {
1611        self.dispatch_indirect_load(dst, base, offset, 2)
1612    }
1613
1614    #[inline(always)]
1615    fn load_indirect_i16(&mut self, _offset: u32, _args_length: u32, dst: RawReg, base: RawReg, offset: u32) -> Self::ReturnTy {
1616        self.dispatch_indirect_load(dst, base, offset, 2)
1617    }
1618
1619    #[inline(always)]
1620    fn load_indirect_u32(&mut self, _offset: u32, _args_length: u32, dst: RawReg, base: RawReg, offset: u32) -> Self::ReturnTy {
1621        self.dispatch_indirect_load(dst, base, offset, 4)
1622    }
1623
1624    #[inline(always)]
1625    fn load_indirect_i32(&mut self, _offset: u32, _args_length: u32, dst: RawReg, base: RawReg, offset: u32) -> Self::ReturnTy {
1626        self.dispatch_indirect_load(dst, base, offset, 4)
1627    }
1628
1629    #[inline(always)]
1630    fn load_indirect_u64(&mut self, _offset: u32, _args_length: u32, dst: RawReg, base: RawReg, offset: u32) -> Self::ReturnTy {
1631        self.dispatch_indirect_load(dst, base, offset, 8)
1632    }
1633
    // Direct loads (`dst = uN/iN [offset]`, absolute address).
    //
    // As with the indirect loads, only the access size (last argument)
    // differs between variants; signedness does not affect timing.

    #[inline(always)]
    fn load_u8(&mut self, _offset: u32, _args_length: u32, dst: RawReg, offset: u32) -> Self::ReturnTy {
        self.dispatch_load(dst, offset, 1)
    }

    #[inline(always)]
    fn load_i8(&mut self, _offset: u32, _args_length: u32, dst: RawReg, offset: u32) -> Self::ReturnTy {
        self.dispatch_load(dst, offset, 1)
    }

    #[inline(always)]
    fn load_u16(&mut self, _offset: u32, _args_length: u32, dst: RawReg, offset: u32) -> Self::ReturnTy {
        self.dispatch_load(dst, offset, 2)
    }

    #[inline(always)]
    fn load_i16(&mut self, _offset: u32, _args_length: u32, dst: RawReg, offset: u32) -> Self::ReturnTy {
        self.dispatch_load(dst, offset, 2)
    }

    #[inline(always)]
    fn load_u32(&mut self, _offset: u32, _args_length: u32, dst: RawReg, offset: u32) -> Self::ReturnTy {
        self.dispatch_load(dst, offset, 4)
    }

    #[inline(always)]
    fn load_i32(&mut self, _offset: u32, _args_length: u32, dst: RawReg, offset: u32) -> Self::ReturnTy {
        self.dispatch_load(dst, offset, 4)
    }

    #[inline(always)]
    fn load_u64(&mut self, _offset: u32, _args_length: u32, dst: RawReg, offset: u32) -> Self::ReturnTy {
        self.dispatch_load(dst, offset, 8)
    }
1670
    // Indirect stores of an immediate (`uN [base + offset] = value`).
    //
    // The stored value itself is irrelevant for timing; only the access size
    // (last argument) is forwarded to the dispatcher.

    #[inline(always)]
    fn store_imm_indirect_u8(&mut self, _offset: u32, _args_length: u32, base: RawReg, offset: u32, _value: u32) -> Self::ReturnTy {
        self.dispatch_store_imm_indirect(base, offset, 1)
    }

    #[inline(always)]
    fn store_imm_indirect_u16(&mut self, _offset: u32, _args_length: u32, base: RawReg, offset: u32, _value: u32) -> Self::ReturnTy {
        self.dispatch_store_imm_indirect(base, offset, 2)
    }

    #[inline(always)]
    fn store_imm_indirect_u32(&mut self, _offset: u32, _args_length: u32, base: RawReg, offset: u32, _value: u32) -> Self::ReturnTy {
        self.dispatch_store_imm_indirect(base, offset, 4)
    }

    #[inline(always)]
    fn store_imm_indirect_u64(&mut self, _offset: u32, _args_length: u32, base: RawReg, offset: u32, _value: u32) -> Self::ReturnTy {
        self.dispatch_store_imm_indirect(base, offset, 8)
    }
1692
    // Indirect stores of a register (`uN [base + offset] = src`).

    #[inline(always)]
    fn store_indirect_u8(&mut self, _offset: u32, _args_length: u32, src: RawReg, base: RawReg, offset: u32) -> Self::ReturnTy {
        self.dispatch_store_indirect(src, base, offset, 1)
    }

    #[inline(always)]
    fn store_indirect_u16(&mut self, _offset: u32, _args_length: u32, src: RawReg, base: RawReg, offset: u32) -> Self::ReturnTy {
        self.dispatch_store_indirect(src, base, offset, 2)
    }

    #[inline(always)]
    fn store_indirect_u32(&mut self, _offset: u32, _args_length: u32, src: RawReg, base: RawReg, offset: u32) -> Self::ReturnTy {
        self.dispatch_store_indirect(src, base, offset, 4)
    }

    #[inline(always)]
    fn store_indirect_u64(&mut self, _offset: u32, _args_length: u32, src: RawReg, base: RawReg, offset: u32) -> Self::ReturnTy {
        self.dispatch_store_indirect(src, base, offset, 8)
    }
1714
    // Direct stores of an immediate (`uN [offset] = value`); the value does
    // not influence the simulated cost.

    #[inline(always)]
    fn store_imm_u8(&mut self, _offset: u32, _args_length: u32, offset: u32, _value: u32) -> Self::ReturnTy {
        self.dispatch_store_imm(offset, 1)
    }

    #[inline(always)]
    fn store_imm_u16(&mut self, _offset: u32, _args_length: u32, offset: u32, _value: u32) -> Self::ReturnTy {
        self.dispatch_store_imm(offset, 2)
    }

    #[inline(always)]
    fn store_imm_u32(&mut self, _offset: u32, _args_length: u32, offset: u32, _value: u32) -> Self::ReturnTy {
        self.dispatch_store_imm(offset, 4)
    }

    #[inline(always)]
    fn store_imm_u64(&mut self, _offset: u32, _args_length: u32, offset: u32, _value: u32) -> Self::ReturnTy {
        self.dispatch_store_imm(offset, 8)
    }
1736
    // Direct stores of a register (`uN [offset] = src`).

    #[inline(always)]
    fn store_u8(&mut self, _offset: u32, _args_length: u32, src: RawReg, offset: u32) -> Self::ReturnTy {
        self.dispatch_store(src, offset, 1)
    }

    #[inline(always)]
    fn store_u16(&mut self, _offset: u32, _args_length: u32, src: RawReg, offset: u32) -> Self::ReturnTy {
        self.dispatch_store(src, offset, 2)
    }

    #[inline(always)]
    fn store_u32(&mut self, _offset: u32, _args_length: u32, src: RawReg, offset: u32) -> Self::ReturnTy {
        self.dispatch_store(src, offset, 4)
    }

    #[inline(always)]
    fn store_u64(&mut self, _offset: u32, _args_length: u32, src: RawReg, offset: u32) -> Self::ReturnTy {
        self.dispatch_store(src, offset, 8)
    }
1758
    // Register-register conditional branches.
    //
    // Unlike the ALU handlers these forward the instruction's `offset` and
    // `args_length` to the dispatcher; the compared condition itself does not
    // change the simulated cost — all variants share `dispatch_branch`.

    #[inline(always)]
    fn branch_less_unsigned(&mut self, offset: u32, args_length: u32, s1: RawReg, s2: RawReg, imm: u32) -> Self::ReturnTy {
        self.dispatch_branch(offset, args_length, s1, s2, imm)
    }

    #[inline(always)]
    fn branch_less_signed(&mut self, offset: u32, args_length: u32, s1: RawReg, s2: RawReg, imm: u32) -> Self::ReturnTy {
        self.dispatch_branch(offset, args_length, s1, s2, imm)
    }

    #[inline(always)]
    fn branch_greater_or_equal_unsigned(&mut self, offset: u32, args_length: u32, s1: RawReg, s2: RawReg, imm: u32) -> Self::ReturnTy {
        self.dispatch_branch(offset, args_length, s1, s2, imm)
    }

    #[inline(always)]
    fn branch_greater_or_equal_signed(&mut self, offset: u32, args_length: u32, s1: RawReg, s2: RawReg, imm: u32) -> Self::ReturnTy {
        self.dispatch_branch(offset, args_length, s1, s2, imm)
    }

    #[inline(always)]
    fn branch_eq(&mut self, offset: u32, args_length: u32, s1: RawReg, s2: RawReg, imm: u32) -> Self::ReturnTy {
        self.dispatch_branch(offset, args_length, s1, s2, imm)
    }

    #[inline(always)]
    fn branch_not_eq(&mut self, offset: u32, args_length: u32, s1: RawReg, s2: RawReg, imm: u32) -> Self::ReturnTy {
        self.dispatch_branch(offset, args_length, s1, s2, imm)
    }
1790
    // Register-immediate conditional branches.
    //
    // The immediate operand being compared (`_s2`) is ignored for timing;
    // `imm` here is the branch target. All variants share
    // `dispatch_branch_imm`.

    #[inline(always)]
    fn branch_eq_imm(&mut self, offset: u32, args_length: u32, s1: RawReg, _s2: u32, imm: u32) -> Self::ReturnTy {
        self.dispatch_branch_imm(offset, args_length, s1, imm);
    }

    #[inline(always)]
    fn branch_not_eq_imm(&mut self, offset: u32, args_length: u32, s1: RawReg, _s2: u32, imm: u32) -> Self::ReturnTy {
        self.dispatch_branch_imm(offset, args_length, s1, imm);
    }

    #[inline(always)]
    fn branch_less_unsigned_imm(&mut self, offset: u32, args_length: u32, s1: RawReg, _s2: u32, imm: u32) -> Self::ReturnTy {
        self.dispatch_branch_imm(offset, args_length, s1, imm);
    }

    #[inline(always)]
    fn branch_less_signed_imm(&mut self, offset: u32, args_length: u32, s1: RawReg, _s2: u32, imm: u32) -> Self::ReturnTy {
        self.dispatch_branch_imm(offset, args_length, s1, imm);
    }

    #[inline(always)]
    fn branch_greater_or_equal_unsigned_imm(&mut self, offset: u32, args_length: u32, s1: RawReg, _s2: u32, imm: u32) -> Self::ReturnTy {
        self.dispatch_branch_imm(offset, args_length, s1, imm);
    }

    #[inline(always)]
    fn branch_greater_or_equal_signed_imm(&mut self, offset: u32, args_length: u32, s1: RawReg, _s2: u32, imm: u32) -> Self::ReturnTy {
        self.dispatch_branch_imm(offset, args_length, s1, imm);
    }

    #[inline(always)]
    fn branch_less_or_equal_unsigned_imm(&mut self, offset: u32, args_length: u32, s1: RawReg, _s2: u32, imm: u32) -> Self::ReturnTy {
        self.dispatch_branch_imm(offset, args_length, s1, imm);
    }

    #[inline(always)]
    fn branch_less_or_equal_signed_imm(&mut self, offset: u32, args_length: u32, s1: RawReg, _s2: u32, imm: u32) -> Self::ReturnTy {
        self.dispatch_branch_imm(offset, args_length, s1, imm);
    }

    #[inline(always)]
    fn branch_greater_unsigned_imm(&mut self, offset: u32, args_length: u32, s1: RawReg, _s2: u32, imm: u32) -> Self::ReturnTy {
        self.dispatch_branch_imm(offset, args_length, s1, imm);
    }

    #[inline(always)]
    fn branch_greater_signed_imm(&mut self, offset: u32, args_length: u32, s1: RawReg, _s2: u32, imm: u32) -> Self::ReturnTy {
        self.dispatch_branch_imm(offset, args_length, s1, imm);
    }
1842
    // Division and remainder.
    //
    // All eight variants (div/rem × signed/unsigned × 32/64-bit) share the
    // same `dispatch_division` cost model; neither the width nor the
    // signedness changes the simulated timing.

    #[inline(always)]
    fn div_unsigned_32(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, s2: RawReg) -> Self::ReturnTy {
        self.dispatch_division(d, s1, s2)
    }

    #[inline(always)]
    fn div_signed_32(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, s2: RawReg) -> Self::ReturnTy {
        self.dispatch_division(d, s1, s2)
    }

    #[inline(always)]
    fn rem_unsigned_32(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, s2: RawReg) -> Self::ReturnTy {
        self.dispatch_division(d, s1, s2)
    }

    #[inline(always)]
    fn rem_signed_32(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, s2: RawReg) -> Self::ReturnTy {
        self.dispatch_division(d, s1, s2)
    }

    #[inline(always)]
    fn div_unsigned_64(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, s2: RawReg) -> Self::ReturnTy {
        self.dispatch_division(d, s1, s2)
    }

    #[inline(always)]
    fn div_signed_64(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, s2: RawReg) -> Self::ReturnTy {
        self.dispatch_division(d, s1, s2)
    }

    #[inline(always)]
    fn rem_unsigned_64(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, s2: RawReg) -> Self::ReturnTy {
        self.dispatch_division(d, s1, s2)
    }

    #[inline(always)]
    fn rem_signed_64(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, s2: RawReg) -> Self::ReturnTy {
        self.dispatch_division(d, s1, s2)
    }
1884
    // Misc

    /// `d = s1 & !s2`. Costs are a fixed estimate rather than a measured
    /// model (see the TODO below).
    #[inline(always)]
    fn and_inverted(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, s2: RawReg) -> Self::ReturnTy {
        // TODO: inaccurate
        self.dispatch_3op(
            d,
            s1,
            s2,
            InstCost {
                latency: 2,
                decode_slots: 3,
                alu_slots: 1,
                ..EMPTY_COST
            },
        )
    }

    /// `d = s1 | !s2`. Same fixed cost estimate as `and_inverted`.
    #[inline(always)]
    fn or_inverted(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, s2: RawReg) -> Self::ReturnTy {
        // TODO: inaccurate
        self.dispatch_3op(
            d,
            s1,
            s2,
            InstCost {
                latency: 2,
                decode_slots: 3,
                alu_slots: 1,
                ..EMPTY_COST
            },
        )
    }
1918
    /// `d = !(s1 ^ s2)`.
    #[inline(always)]
    fn xnor(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, s2: RawReg) -> Self::ReturnTy {
        self.dispatch_3op(
            d,
            s1,
            s2,
            InstCost {
                latency: 2,
                // One extra decode slot when the destination is distinct from
                // both sources — presumably the lowered code then needs an
                // extra register move; TODO confirm against the backend.
                decode_slots: 2 + u32::from((d.get() != s1.get()) & (d.get() != s2.get())),
                alu_slots: 1,
                ..EMPTY_COST
            },
        );
    }
1933
    /// 64-bit negate-and-add-immediate (`d = -s1 + imm`); the immediate does
    /// not affect the simulated cost.
    #[inline(always)]
    fn negate_and_add_imm_64(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, _s2: u32) -> Self::ReturnTy {
        self.dispatch_2op(
            d,
            s1,
            InstCost {
                latency: 2,
                decode_slots: 3,
                ..EMPTY_COST
            },
        )
    }

    /// 32-bit variant; modeled as slightly more expensive than the 64-bit
    /// one (extra cycle / decode slot — presumably due to the required
    /// 32-bit truncation or sign-extension in the lowering; TODO confirm).
    #[inline(always)]
    fn negate_and_add_imm_32(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, _s2: u32) -> Self::ReturnTy {
        self.dispatch_2op(
            d,
            s1,
            InstCost {
                latency: 3,
                decode_slots: 4,
                alu_slots: 1,
                ..EMPTY_COST
            },
        )
    }
1961
    /// Register-to-register move; handled by a dedicated path (the timeline
    /// tests show a bare move consuming only a decode slot, i.e. it is
    /// effectively eliminated rather than executed).
    #[inline(always)]
    fn move_reg(&mut self, _offset: u32, _args_length: u32, dst: RawReg, src: RawReg) -> Self::ReturnTy {
        self.dispatch_move_reg_avx2(dst, src);
    }
1966
    /// Load a 32-bit immediate into a register; the value itself does not
    /// affect the cost.
    #[inline(always)]
    fn load_imm(&mut self, _offset: u32, _args_length: u32, dst: RawReg, _value: u32) -> Self::ReturnTy {
        self.dispatch_1op_dst(
            dst,
            InstCost {
                latency: 1,
                decode_slots: 1,
                ..EMPTY_COST
            },
        )
    }

    /// Load a 64-bit immediate; costs one more decode slot than the 32-bit
    /// form (the wider immediate takes more encoding space).
    #[inline(always)]
    fn load_imm64(&mut self, _offset: u32, _args_length: u32, dst: RawReg, _value: u64) -> Self::ReturnTy {
        self.dispatch_1op_dst(
            dst,
            InstCost {
                latency: 1,
                decode_slots: 2,
                ..EMPTY_COST
            },
        );
    }
1990
    /// 32-bit multiply. The extra decode slot when `d` differs from both
    /// sources presumably accounts for an additional register move in the
    /// lowered code (TODO confirm); the 32-bit form is modeled one cycle
    /// slower than the 64-bit one.
    #[inline(always)]
    fn mul_32(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, s2: RawReg) -> Self::ReturnTy {
        self.dispatch_3op(
            d,
            s1,
            s2,
            InstCost {
                latency: 4,
                decode_slots: 2 + u32::from((d.get() != s1.get()) & (d.get() != s2.get())),
                alu_slots: 1,
                mul_slots: 1,
                ..EMPTY_COST
            },
        )
    }

    /// 64-bit multiply; same decode-slot rule as `mul_32`.
    #[inline(always)]
    fn mul_64(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, s2: RawReg) -> Self::ReturnTy {
        self.dispatch_3op(
            d,
            s1,
            s2,
            InstCost {
                latency: 3,
                decode_slots: 1 + u32::from((d.get() != s1.get()) & (d.get() != s2.get())),
                alu_slots: 1,
                mul_slots: 1,
                ..EMPTY_COST
            },
        )
    }

    /// 32-bit multiply by immediate; the immediate value is irrelevant for
    /// timing, and the extra decode slot applies when `d != s1`.
    #[inline(always)]
    fn mul_imm_32(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, _s2: u32) -> Self::ReturnTy {
        self.dispatch_2op(
            d,
            s1,
            InstCost {
                latency: 4,
                decode_slots: 2 + u32::from(d.get() != s1.get()),
                alu_slots: 1,
                mul_slots: 1,
                ..EMPTY_COST
            },
        )
    }

    /// 64-bit multiply by immediate.
    #[inline(always)]
    fn mul_imm_64(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, _s2: u32) -> Self::ReturnTy {
        self.dispatch_2op(
            d,
            s1,
            InstCost {
                latency: 3,
                decode_slots: 1 + u32::from(d.get() != s1.get()),
                alu_slots: 1,
                mul_slots: 1,
                ..EMPTY_COST
            },
        )
    }
2052
    // Upper-half multiplies (`d = high_bits(s1 * s2)`). The mixed
    // signed×unsigned variant is modeled as the most expensive of the three.

    #[inline(always)]
    fn mul_upper_signed_signed(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, s2: RawReg) -> Self::ReturnTy {
        self.dispatch_3op(
            d,
            s1,
            s2,
            InstCost {
                latency: 4,
                decode_slots: 4,
                alu_slots: 1,
                mul_slots: 1,
                ..EMPTY_COST
            },
        )
    }

    #[inline(always)]
    fn mul_upper_unsigned_unsigned(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, s2: RawReg) -> Self::ReturnTy {
        self.dispatch_3op(
            d,
            s1,
            s2,
            InstCost {
                latency: 4,
                decode_slots: 4,
                alu_slots: 1,
                mul_slots: 1,
                ..EMPTY_COST
            },
        )
    }

    #[inline(always)]
    fn mul_upper_signed_unsigned(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, s2: RawReg) -> Self::ReturnTy {
        self.dispatch_3op(
            d,
            s1,
            s2,
            InstCost {
                latency: 6,
                decode_slots: 4,
                alu_slots: 1,
                mul_slots: 1,
                ..EMPTY_COST
            },
        )
    }
2100
    // End of block instructions

    /// Invalid opcode; terminates the block like `trap`. Marked `#[cold]`
    /// (instead of `#[inline(always)]`) since invalid opcodes are not
    /// expected on the hot path.
    #[cold]
    fn invalid(&mut self, _offset: u32, _args_length: u32) -> Self::ReturnTy {
        self.dispatch_finish(2);
    }

    #[inline(always)]
    fn trap(&mut self, _offset: u32, _args_length: u32) -> Self::ReturnTy {
        self.dispatch_finish(2);
    }

    #[inline(always)]
    fn fallthrough(&mut self, _offset: u32, _args_length: u32) -> Self::ReturnTy {
        self.dispatch_finish(2);
    }

    /// The `unlikely` hint; modeled as a long fixed-latency instruction with
    /// no register dependencies.
    #[inline(always)]
    fn unlikely(&mut self, _offset: u32, _args_length: u32) -> Self::ReturnTy {
        self.dispatch_generic(
            None,
            None,
            None,
            InstCost {
                latency: 40,
                decode_slots: 1,
                ..EMPTY_COST
            },
        );
    }
2131
    /// Direct jump; terminates the block with a fixed cost (note: higher than
    /// the `trap`/`fallthrough` terminators, which use 2).
    #[inline(always)]
    fn jump(&mut self, _offset: u32, _args_length: u32, _target: u32) -> Self::ReturnTy {
        self.dispatch_finish(15);
    }

    /// Direct call (`load_imm` of the return address + jump); same terminator
    /// cost as a plain `jump`.
    #[inline(always)]
    fn load_imm_and_jump(&mut self, _offset: u32, _args_length: u32, _ra: RawReg, _value: u32, _target: u32) -> Self::ReturnTy {
        self.dispatch_finish(15);
    }
2141
    /// Indirect jump. Dispatched as a long-latency instruction that depends
    /// on `base`, then the pipeline is drained and the simulation for this
    /// block is marked as finished.
    #[inline(always)]
    fn jump_indirect(&mut self, _offset: u32, _args_length: u32, base: RawReg, _base_offset: u32) -> Self::ReturnTy {
        self.dispatch_generic(
            None,
            Some(base),
            None,
            InstCost {
                latency: 22,
                decode_slots: 1,
                ..EMPTY_COST
            },
        );
        // An indirect jump ends the block: drain all in-flight instructions.
        self.wait_until_empty();
        self.finished = true;
    }

    /// Indirect call; identical cost model to `jump_indirect` (the return
    /// address load does not add to the simulated cost).
    #[inline(always)]
    fn load_imm_and_jump_indirect(
        &mut self,
        _offset: u32,
        _args_length: u32,
        _ra: RawReg,
        base: RawReg,
        _value: u32,
        _base_offset: u32,
    ) -> Self::ReturnTy {
        self.dispatch_generic(
            None,
            Some(base),
            None,
            InstCost {
                latency: 22,
                decode_slots: 1,
                ..EMPTY_COST
            },
        );
        self.wait_until_empty();
        self.finished = true;
    }
2181
    // Special instructions

    /// Host call; modeled with a large flat latency (rough estimate of the
    /// host-call overhead; the ecall number is irrelevant for timing).
    #[inline(always)]
    fn ecalli(&mut self, _offset: u32, _args_length: u32, _imm: u32) -> Self::ReturnTy {
        self.dispatch_generic(
            None,
            None,
            None,
            InstCost {
                latency: 100,
                decode_slots: 4,
                alu_slots: 1,
                ..EMPTY_COST
            },
        );
    }

    /// Heap growth; cost is an acknowledged guess (see TODO).
    #[inline(always)]
    fn sbrk(&mut self, _offset: u32, _args_length: u32, dst: RawReg, src: RawReg) -> Self::ReturnTy {
        // TODO: YOLO assigned
        self.dispatch_2op(
            dst,
            src,
            InstCost {
                latency: 100,
                decode_slots: 4,
                alu_slots: 1,
                ..EMPTY_COST
            },
        );
    }

    /// Bulk memory fill; cost is an acknowledged guess (see TODO) and does
    /// not scale with the fill length.
    #[inline(always)]
    fn memset(&mut self, _offset: u32, _args_length: u32) -> Self::ReturnTy {
        // TODO: YOLO assigned
        self.dispatch_generic(
            None,
            None,
            None,
            InstCost {
                latency: 100,
                decode_slots: 4,
                alu_slots: 1,
                ..EMPTY_COST
            },
        )
    }
2229}
2230
/// Configuration for [`timeline_for_instructions`].
#[derive(Clone)]
#[non_exhaustive]
pub struct TimelineConfig<'a> {
    // Whether the simulator may fast-forward over idle cycles; the rendered
    // timeline is post-processed so the output is the same either way.
    pub should_enable_fast_forward: bool,
    // How instructions are disassembled in the right-hand column of the
    // rendered timeline.
    pub instruction_format: InstructionFormat<'a>,
}
2237
2238impl<'a> Default for TimelineConfig<'a> {
2239    fn default() -> Self {
2240        TimelineConfig {
2241            should_enable_fast_forward: false,
2242            instruction_format: InstructionFormat {
2243                is_64_bit: true,
2244                ..InstructionFormat::default()
2245            },
2246        }
2247    }
2248}
2249
/// Simulates a single basic block (the leading run of `instructions` up to
/// and including the first one that starts a new basic block) and renders an
/// ASCII pipeline timeline: one row per instruction, one event character per
/// simulated cycle, followed by the disassembled instruction.
///
/// Returns the rendered timeline together with the gas cost computed for the
/// block. If the given instructions do not terminate the block, a synthetic
/// `invalid` instruction is appended to terminate it.
pub fn timeline_for_instructions(
    code: &[u8],
    isa: InstructionSetKind,
    cache_model: CacheModel,
    instructions: &[crate::program::ParsedInstruction],
    config: TimelineConfig,
) -> (String, u32) {
    use alloc::collections::BTreeMap;

    // Tracer which records every simulator event, keyed by (cycle, instruction).
    struct TimelineTracer<'a> {
        should_enable_fast_forward: bool,
        timeline: &'a mut BTreeMap<(u32, u32), EventKind>,
    }

    impl<'a> Tracer for TimelineTracer<'a> {
        const SHOULD_CALL_ON_EVENT: bool = true;

        fn should_enable_fast_forward(&self) -> bool {
            self.should_enable_fast_forward
        }

        fn on_event(&mut self, cycle: u32, instruction: u32, event: EventKind) {
            match self.timeline.entry((cycle, instruction)) {
                alloc::collections::btree_map::Entry::Vacant(entry) => {
                    #[cfg(all(test, feature = "logging"))]
                    log::debug!(
                        "on_event[{cycle}]: instruction={instruction} '{}' (event={event:?})",
                        char::from(event)
                    );
                    entry.insert(event);
                }
                alloc::collections::btree_map::Entry::Occupied(entry) => {
                    // The simulator must emit at most one event per
                    // (cycle, instruction) pair; anything else is a bug.
                    panic!(
                        "duplicate timeline update: cycle={cycle} instruction={instruction} old_event={:?} new_event={event:?}",
                        entry.get()
                    );
                }
            }
        }
    }

    // Keep only the instructions up to the first block terminator (inclusive).
    let count = instructions
        .iter()
        .take_while(|inst| !inst.kind.opcode().starts_new_basic_block())
        .count();

    let mut instructions = instructions[..(count + 1).min(instructions.len())].to_vec();
    if !instructions
        .last()
        .map(|instruction| instruction.kind.opcode().starts_new_basic_block())
        .unwrap_or(false)
    {
        // The block isn't terminated; append a synthetic `invalid` to end it.
        let next_pc = instructions.last().map(|instruction| instruction.next_offset.0).unwrap_or(0);
        instructions.push(crate::program::ParsedInstruction {
            kind: crate::program::Instruction::invalid,
            offset: crate::program::ProgramCounter(next_pc),
            next_offset: crate::program::ProgramCounter(next_pc + 1),
        });
    }

    let mut timeline_map = BTreeMap::new();
    let mut sim = Simulator::<B64, _>::new(
        code,
        isa,
        cache_model,
        TimelineTracer {
            should_enable_fast_forward: config.should_enable_fast_forward,
            timeline: &mut timeline_map,
        },
    );

    // Feed every instruction through the simulator; the block cost must only
    // become available once the terminator has been visited.
    for &instruction in &instructions {
        assert!(sim.take_block_cost().is_none());
        instruction.visit_parsing(&mut sim);
    }

    let total_cycles = cast(sim.cycles).to_usize();
    let block_cost = sim.take_block_cost().unwrap();
    #[cfg(all(test, feature = "logging"))]
    log::debug!("Total cycles: {total_cycles}");

    #[cfg(all(test, feature = "logging"))]
    log::debug!("Block cost: {block_cost}");

    // Rasterize the sparse event map into a row-major grid of characters
    // (one row per instruction, one column per cycle); untouched cells
    // remain '.'.
    let mut timeline = vec!['.'; total_cycles * instructions.len()];
    for ((cycle, instruction), event) in timeline_map {
        let index = instruction as usize * total_cycles + cycle as usize;
        timeline[index] = char::from(event);
    }

    // Render each row followed by two spaces and the disassembly.
    let mut timeline_s = String::new();
    for (nth_instruction, instruction) in instructions.iter().enumerate() {
        use core::fmt::Write;

        let line = &timeline[nth_instruction * total_cycles..(nth_instruction + 1) * total_cycles];
        timeline_s.extend(line.iter().copied());
        timeline_s.push_str("  ");
        writeln!(&mut timeline_s, "{}", instruction.display(&config.instruction_format)).unwrap();
    }

    if config.should_enable_fast_forward {
        // Fast-forwarding leaves '.' holes in the middle of a row where the
        // simulator skipped cycles. Fill each hole with the last event
        // character seen on that row — except after 'R' or 'D', which are
        // not stretched — so that the output matches a non-fast-forwarded
        // run. The two-space separator marks the end of the cycle columns.
        let mut timeline_new = String::with_capacity(timeline_s.len());
        let mut is_in_cycles = true;
        let mut last = '.';
        for mut ch in timeline_s.chars() {
            if ch == ' ' {
                is_in_cycles = false;
            } else if ch == '\n' {
                is_in_cycles = true;
                last = '.';
            } else if ch == '.' {
                if last != 'R' && last != 'D' && is_in_cycles {
                    ch = last;
                }
            } else {
                last = ch;
            }
            timeline_new.push(ch);
        }
        timeline_s = timeline_new;
    }

    (timeline_s, block_cost)
}
2374
2375pub fn trap_cost(isa: InstructionSetKind, cache_model: CacheModel) -> u32 {
2376    let mut sim = Simulator::<B64, _>::new(&[], isa, cache_model, ());
2377    crate::program::ParsedInstruction {
2378        kind: crate::program::Instruction::trap,
2379        offset: crate::program::ProgramCounter(0),
2380        next_offset: crate::program::ProgramCounter(0),
2381    }
2382    .visit_parsing(&mut sim);
2383    sim.take_block_cost().unwrap()
2384}
2385
2386#[cfg(test)]
2387mod tests {
2388    use alloc::string::String;
2389    use alloc::vec::Vec;
2390
2391    use super::{timeline_for_instructions, CacheModel, TimelineConfig};
2392    use crate::assembler::assemble;
2393    use crate::program::{InstructionSetKind, ProgramBlob};
2394
2395    #[cfg(test)]
2396    fn test_config() -> CacheModel {
2397        CacheModel::L1Hit
2398    }
2399
2400    #[cfg(test)]
2401    fn assert_timeline(config: CacheModel, program: &str, expected_timeline: &str) {
2402        use crate::cast::cast;
2403
2404        let _ = env_logger::try_init();
2405
2406        let program = assemble(Some(InstructionSetKind::Latest64), program).unwrap();
2407        let blob = ProgramBlob::parse(program.into()).unwrap();
2408        let instructions: Vec<_> = blob.instructions().collect();
2409
2410        let (timeline_s, cycles) = timeline_for_instructions(
2411            blob.code(),
2412            InstructionSetKind::Latest64,
2413            config,
2414            &instructions,
2415            TimelineConfig::default(),
2416        );
2417        let mut expected_timeline_s = String::new();
2418        let mut expected_cycles = 0;
2419        for line in expected_timeline.lines() {
2420            let line = line.trim();
2421            if line.is_empty() {
2422                continue;
2423            }
2424            expected_timeline_s.push_str(line);
2425            expected_timeline_s.push('\n');
2426
2427            expected_cycles = expected_cycles.max(line.split("  ").next().unwrap().len() as u32);
2428        }
2429
2430        if timeline_s != expected_timeline_s {
2431            panic!("Timeline mismatch!\n\nExpected timeline:\n{expected_timeline_s}\nActual timeline:\n{timeline_s}");
2432        }
2433
2434        let expected_cycles = cast(expected_cycles).to_signed() - 3;
2435        assert_eq!(cast(cycles).to_signed(), expected_cycles);
2436
2437        #[cfg(feature = "logging")]
2438        log::debug!("Rerunning with fast-forward enabled...");
2439
2440        let timeline_config = TimelineConfig {
2441            should_enable_fast_forward: true,
2442            ..TimelineConfig::default()
2443        };
2444        let (timeline_ff_s, cycles_ff) =
2445            timeline_for_instructions(blob.code(), InstructionSetKind::Latest64, config, &instructions, timeline_config);
2446        assert_eq!(cycles_ff, cycles);
2447        if timeline_ff_s != expected_timeline_s {
2448            panic!("Timeline mismatch for fast-forward!\n\nExpected timeline:\n{expected_timeline_s}\nActual timeline:\n{timeline_ff_s}");
2449        }
2450    }
2451
    // Two independent adds issue in the same cycle.
    #[test]
    fn test_parallel_simple() {
        assert_timeline(
            test_config(),
            "
                a0 = a1 + a2
                a1 = a1 + a2
                trap
            ",
            "
                DeER.  a0 = a1 + a2
                DeER.  a1 = a1 + a2
                DeeER  trap
            ",
        );
    }

    // The second add depends on the first and must wait for its result.
    #[test]
    fn test_sequential_simple() {
        assert_timeline(
            test_config(),
            "
                a0 = a1 + a2
                a1 = a0 + a2
                trap
            ",
            "
                DeER..  a0 = a1 + a2
                D=eER.  a1 = a0 + a2
                .DeeER  trap
            ",
        );
    }

    // 64-bit immediates take two decode slots each, so only two fit per cycle.
    #[test]
    fn test_sequential_decode_limits() {
        assert_timeline(
            test_config(),
            "
                a0 = 0x12345678aabbccdd
                a1 = 0x12345678aabbccdd
                a2 = 0x12345678aabbccdd
                a3 = 0x12345678aabbccdd
                trap
            ",
            "
                DeER...  a0 = 0x12345678aabbccdd
                DeER...  a1 = 0x12345678aabbccdd
                .DeER..  a2 = 0x12345678aabbccdd
                .DeER..  a3 = 0x12345678aabbccdd
                ..DeeER  trap
            ",
        );
    }

    // Only one multiply pipe: the second independent mul stalls behind the first.
    #[test]
    fn test_resource_limits_mul() {
        assert_timeline(
            test_config(),
            "
                a0 = a1 * a2
                a1 = a3 * a4
                trap
            ",
            "
                DeeeER...  a0 = a1 * a2
                D===eeeER  a1 = a3 * a4
                .DeeE---R  trap
            ",
        );
    }
2523
2524    #[test]
2525    fn test_mul_with_dep() {
2526        assert_timeline(
2527            test_config(),
2528            "
2529                a0 = a1 + a2
2530                a4 = a0 * a3
2531                trap
2532            ",
2533            "
2534                DeER...  a0 = a1 + a2
2535                D=eeeER  a4 = a0 * a3
2536                .DeeE-R  trap
2537            ",
2538        );
2539    }
2540
// Register-to-register moves: 'a0 = s0' shows 'D.....' — decoded but never
// executed or retired, i.e. the move appears to be eliminated in the
// front end (TODO(review): confirm move-elimination semantics). The
// dependent add still waits one cycle ('=') for the moved value. Note the
// expected disassembly normalizes immediates to hex ('1' -> '0x1').
2541    #[test]
2542    fn test_register_move() {
2543        assert_timeline(
2544            test_config(),
2545            "
2546                s0 = 1
2547                a0 = s0
2548                a1 = a0 + 1
2549                trap
2550            ",
2551            "
2552                DeER..  s0 = 0x1
2553                D.....  a0 = s0
2554                D=eER.  a1 = a0 + 0x1
2555                .DeeER  trap
2556            ",
2557        )
2558    }
2559
// Function-epilogue pattern: three parallel stack reloads plus an indirect
// return. The 'a0 = s1' move is eliminated ('D....'), the three loads
// issue in the same cycle (within MAX_LOAD_SLOTS = 4) each with a 4-cycle
// latency under test_config(), and 'ret' stalls until the reloaded 'ra'
// is available before paying its long branch cost — presumably an
// indirect-jump penalty; TODO(review) confirm against the cost model.
2560    #[test]
2561    fn test_memory_accesses() {
2562        assert_timeline(
2563            test_config(),
2564            "
2565                a0 = s1
2566                ra = u64 [sp + 0x30]
2567                s0 = u64 [sp + 0x28]
2568                s1 = u64 [sp + 0x20]
2569                sp = sp + 0x38
2570                ret
2571            ",
2572            "
2573                D............................  a0 = s1
2574                DeeeeER......................  ra = u64 [sp + 0x30]
2575                DeeeeER......................  s0 = u64 [sp + 0x28]
2576                DeeeeER......................  s1 = u64 [sp + 0x20]
2577                .DeE--R......................  sp = sp + 0x38
2578                .D===eeeeeeeeeeeeeeeeeeeeeeER  ret
2579            ",
2580        )
2581    }
2582
// Degenerate basic block: a lone 'fallthrough' terminator. Pins the
// minimal timeline (decode + 2 execute cycles + retire) for an otherwise
// empty block.
2583    #[test]
2584    fn test_empty() {
2585        assert_timeline(
2586            test_config(),
2587            "
2588                fallthrough
2589            ",
2590            "
2591                DeeER  fallthrough
2592            ",
2593        );
2594    }
2595
// Write-after-write test: the first load's result into s0 is immediately
// overwritten by the add. The add executes in cycle 1 without waiting for
// the in-flight load — consistent with register renaming (no WAW hazard) —
// while the second load and the indirect jump form a true dependency
// chain on the add's s0.
2596    #[test]
2597    fn test_overwrite_register() {
2598        assert_timeline(
2599            test_config(),
2600            "
2601                s0 = u64 [sp]
2602                s0 = a1 + a2
2603                s0 = u64 [s0]
2604                jump [s0]
2605            ",
2606            "
2607                DeeeeER.......................  s0 = u64 [sp]
2608                DeE---R.......................  s0 = a1 + a2
2609                D=eeeeER......................  s0 = u64 [s0]
2610                .D====eeeeeeeeeeeeeeeeeeeeeeER  jump [s0]
2611            ",
2612        );
2613    }
2614
// Branch-on-loaded-value test: the conditional jump stalls ('====') until
// the 4-cycle byte load producing a2 completes. Also pins the label/imm
// normalization in the expected disassembly ('@0' -> '0', '11' -> '0xb').
2615    #[test]
2616    fn test_load_and_jump() {
2617        assert_timeline(
2618            test_config(),
2619            "
2620                @0:
2621                a2 = u8 [a0 + 11]
2622                jump @0 if a2 == 0
2623            ",
2624            "
2625                DeeeeER.  a2 = u8 [a0 + 0xb]
2626                D====eER  jump 0 if a2 == 0
2627            ",
2628        );
2629    }
2630
// Mixed-workload scheduling test: a load, a chain of dependent ALU ops,
// a store, and a trap. Checks the interplay of decode grouping, the
// ALU dependency chain through a1/a2, and in-order retirement (early
// finishers pad with '-' until older instructions retire). The store's
// long execution presumably reflects the store-completion cost in
// test_config() — TODO(review) confirm.
2631    #[test]
2632    fn test_complex() {
2633        assert_timeline(
2634            test_config(),
2635            "
2636                a2 = i16 [a0 + 0x6]
2637                a1 = a1 & 0x7
2638                a3 = 0x1
2639                a1 = a1 << 0x8
2640                a2 = a2 & 0xfffffffffffff8ff
2641                a1 = a1 | a2
2642                a2 = a1 + a3
2643                u8 [a0 + 0x2] = a3
2644                trap
2645            ",
2646            "
2647                DeeeeER.......................  a2 = i16 [a0 + 0x6]
2648                DeE---R.......................  a1 = a1 & 0x7
2649                DeE---R.......................  a3 = 0x1
2650                D=eE--R.......................  a1 = a1 << 0x8
2651                .D===eER......................  a2 = a2 & 0xfffffffffffff8ff
2652                .D====eER.....................  a1 = a1 | a2
2653                .D=====eER....................  a2 = a1 + a3
2654                ..DeeeeeeeeeeeeeeeeeeeeeeeeeER  u8 [a0 + 0x2] = a3
2655                ..DeeE-----------------------R  trap
2656            ",
2657        );
2658    }
2659
// Larger scheduling test: a 32-bit float-ish bit-manipulation kernel
// (clz/shift/mask chain) ending in a back-edge jump. Exercises long
// serial dependency chains through a0/a2 interleaved with independent
// constant loads that execute early and then wait ('------') for
// in-order retirement. Note i32-suffixed ops appear to cost 2 execute
// cycles vs 1 for 64-bit ALU ops under test_config() — TODO(review)
// confirm against the cost table.
2660    #[test]
2661    fn test_even_more_complex() {
2662        assert_timeline(
2663            test_config(),
2664            "
2665                @0:
2666                i32 a1 = clz a0
2667                i32 a0 = a0 << a1
2668                a1 = a1 << 0x17
2669                i32 a2 = a0 >> 0x8
2670                a3 = a0 >> 0x7
2671                a3 = a3 & ~a2
2672                i32 a2 = a2 - a1
2673                a0 = a0 << 0x18
2674                a3 = a3 & 0x1
2675                i32 a0 = a0 - a3
2676                i32 a0 = a0 >> 0x1f
2677                a1 = a2 + 0x4e800000
2678                i32 a0 = a0 + a1
2679                a1 = 0x46008c00
2680                ra = 0x24
2681                jump @0
2682            ",
2683            "
2684                DeER.....................  i32 a1 = clz a0
2685                D=eeER...................  i32 a0 = a0 << a1
2686                .DeE-R...................  a1 = a1 << 0x17
2687                .D==eeER.................  i32 a2 = a0 >> 0x8
2688                ..D=eE-R.................  a3 = a0 >> 0x7
2689                ...D==eeER...............  a3 = a3 & ~a2
2690                ....D=eeER...............  i32 a2 = a2 - a1
2691                ....DeE--R...............  a0 = a0 << 0x18
2692                ....D===eER..............  a3 = a3 & 0x1
2693                .....D===eeER............  i32 a0 = a0 - a3
2694                .....D=====eeER..........  i32 a0 = a0 >> 0x1f
2695                ......D=eE----R..........  a1 = a2 + 0x4e800000
2696                ......D======eeER........  i32 a0 = a0 + a1
2697                .......DeE------R........  a1 = 0x46008c00
2698                .......DeE------R........  ra = 0x24
2699                .......DeeeeeeeeeeeeeeeER  jump 0
2700            ",
2701        );
2702    }
2703
// End-to-end stress test under CacheModel::L1Hit: a realistic byte-
// assembly loop (eight byte loads combined into a u64, stored, then a
// multiply and a loop-back branch). Loads complete in 4 cycles (L1).
// Key behavior pinned here: the store is the 33rd instruction, and with
// REORDER_BUFFER_SIZE = 32 its decode is pushed out (long '...' run) until
// the oldest instruction — the expensive 'unlikely' marker — retires and
// frees a reorder-buffer slot. TODO(review): confirm that reading; it is
// inferred from the instruction count, not from visible simulator code.
// Same program as test_super_complex_l2, which runs it under L2Hit.
2704    #[test]
2705    fn test_super_complex_l1() {
2706        assert_timeline(
2707            CacheModel::L1Hit,
2708            "
2709                @0:
2710                unlikely
2711                t1 = u8 [s0]
2712                a1 = u8 [s0 + 0x11]
2713                a2 = 0x172d0
2714                a3 = u8 [s0 + 0x16]
2715                t0 = sp + 0x58
2716                a1 = a1 << 0x3
2717                a1 = a1 + a2
2718                a2 = u8 [a1]
2719                a5 = u8 [a1 + 0x1]
2720                s1 = u8 [a1 + 0x2]
2721                a4 = u8 [a1 + 0x3]
2722                a3 = a3 + t0
2723                a5 = a5 << 0x8
2724                s1 = s1 << 0x10
2725                a4 = a4 << 0x18
2726                a2 = a2 | a5
2727                a5 = u8 [a1 + 0x4]
2728                a0 = u8 [a1 + 0x5]
2729                a4 = a4 | s1
2730                s1 = u8 [a1 + 0x6]
2731                a1 = u8 [a1 + 0x7]
2732                a0 = a0 << 0x8
2733                a0 = a0 | a5
2734                s1 = s1 << 0x10
2735                a1 = a1 << 0x18
2736                a1 = a1 | s1
2737                a2 = a2 | a4
2738                a0 = a0 | a1
2739                a1 = s0 - t1
2740                a0 = a0 << 0x20
2741                a0 = a0 | a2
2742                u64 [sp + 0x58] = a0
2743                a0 = u8 [a3]
2744                a1 = u8 [a1 + 0x4]
2745                a0 = a1 * a0
2746                a1 = u8 [s0 + 0x23]
2747                jump @0 if a1 != 0
2748            ",
2749            "
2750                DeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeER............................  unlikely
2751                DeeeeE------------------------------------R............................  t1 = u8 [s0]
2752                DeeeeE------------------------------------R............................  a1 = u8 [s0 + 0x11]
2753                DeE---------------------------------------R............................  a2 = 0x172d0
2754                .DeeeeE-----------------------------------R............................  a3 = u8 [s0 + 0x16]
2755                .DeE--------------------------------------R............................  t0 = sp + 0x58
2756                .D===eE-----------------------------------R............................  a1 = a1 << 0x3
2757                ..D===eE----------------------------------R............................  a1 = a1 + a2
2758                ..D====eeeeE------------------------------R............................  a2 = u8 [a1]
2759                ..D====eeeeE------------------------------R............................  a5 = u8 [a1 + 0x1]
2760                ..D====eeeeE------------------------------R............................  s1 = u8 [a1 + 0x2]
2761                ...D===eeeeE------------------------------R............................  a4 = u8 [a1 + 0x3]
2762                ...D==eE----------------------------------R............................  a3 = a3 + t0
2763                ...D=======eE-----------------------------R............................  a5 = a5 << 0x8
2764                ...D=======eE-----------------------------R............................  s1 = s1 << 0x10
2765                ....D======eE-----------------------------R............................  a4 = a4 << 0x18
2766                ....D=======eE----------------------------R............................  a2 = a2 | a5
2767                ....D======eeeeE--------------------------R............................  a5 = u8 [a1 + 0x4]
2768                ....D=======eeeeE-------------------------R............................  a0 = u8 [a1 + 0x5]
2769                .....D======eE----------------------------R............................  a4 = a4 | s1
2770                .....D=======eeeeE------------------------R............................  s1 = u8 [a1 + 0x6]
2771                .....D=======eeeeE------------------------R............................  a1 = u8 [a1 + 0x7]
2772                .....D==========eE------------------------R............................  a0 = a0 << 0x8
2773                ......D==========eE-----------------------R............................  a0 = a0 | a5
2774                ......D==========eE-----------------------R............................  s1 = s1 << 0x10
2775                ......D==========eE-----------------------R............................  a1 = a1 << 0x18
2776                ......D===========eE----------------------R............................  a1 = a1 | s1
2777                .......D=======eE-------------------------R............................  a2 = a2 | a4
2778                .......D===========eE---------------------R............................  a0 = a0 | a1
2779                .......D========eE------------------------R............................  a1 = s0 - t1
2780                ........D===========eE--------------------R............................  a0 = a0 << 0x20
2781                ........D============eE-------------------R............................  a0 = a0 | a2
2782                ...........................................DeeeeeeeeeeeeeeeeeeeeeeeeeER  u64 [sp + 0x58] = a0
2783                ...........................................DeeeeE---------------------R  a0 = u8 [a3]
2784                ...........................................DeeeeE---------------------R  a1 = u8 [a1 + 0x4]
2785                ...........................................D====eeeE------------------R  a0 = a1 * a0
2786                ............................................DeeeeE--------------------R  a1 = u8 [s0 + 0x23]
2787                ............................................D====eE-------------------R  jump 0 if a1 != 0
2788            ",
2789        );
2790    }
2791
// Same byte-assembly loop as test_super_complex_l1, but under
// CacheModel::L2Hit: every load now takes 25 execute cycles, which
// lengthens the dependent chains dramatically and deepens the operand
// stalls ('=' runs). The reorder-buffer back-pressure on the 33rd
// instruction (the store) is still visible as the long '...' prefix
// before its decode — presumably REORDER_BUFFER_SIZE = 32 at work;
// TODO(review) confirm, as inferred from the instruction count only.
2792    #[test]
2793    fn test_super_complex_l2() {
2794        assert_timeline(
2795            CacheModel::L2Hit,
2796            "
2797                @0:
2798                unlikely
2799                t1 = u8 [s0]
2800                a1 = u8 [s0 + 0x11]
2801                a2 = 0x172d0
2802                a3 = u8 [s0 + 0x16]
2803                t0 = sp + 0x58
2804                a1 = a1 << 0x3
2805                a1 = a1 + a2
2806                a2 = u8 [a1]
2807                a5 = u8 [a1 + 0x1]
2808                s1 = u8 [a1 + 0x2]
2809                a4 = u8 [a1 + 0x3]
2810                a3 = a3 + t0
2811                a5 = a5 << 0x8
2812                s1 = s1 << 0x10
2813                a4 = a4 << 0x18
2814                a2 = a2 | a5
2815                a5 = u8 [a1 + 0x4]
2816                a0 = u8 [a1 + 0x5]
2817                a4 = a4 | s1
2818                s1 = u8 [a1 + 0x6]
2819                a1 = u8 [a1 + 0x7]
2820                a0 = a0 << 0x8
2821                a0 = a0 | a5
2822                s1 = s1 << 0x10
2823                a1 = a1 << 0x18
2824                a1 = a1 | s1
2825                a2 = a2 | a4
2826                a0 = a0 | a1
2827                a1 = s0 - t1
2828                a0 = a0 << 0x20
2829                a0 = a0 | a2
2830                u64 [sp + 0x58] = a0
2831                a0 = u8 [a3]
2832                a1 = u8 [a1 + 0x4]
2833                a0 = a1 * a0
2834                a1 = u8 [s0 + 0x23]
2835                jump @0 if a1 != 0
2836            ",
2837            "
2838                DeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeER.....................................................................  unlikely
2839                DeeeeeeeeeeeeeeeeeeeeeeeeeE---------------R.....................................................................  t1 = u8 [s0]
2840                DeeeeeeeeeeeeeeeeeeeeeeeeeE---------------R.....................................................................  a1 = u8 [s0 + 0x11]
2841                DeE---------------------------------------R.....................................................................  a2 = 0x172d0
2842                .DeeeeeeeeeeeeeeeeeeeeeeeeeE--------------R.....................................................................  a3 = u8 [s0 + 0x16]
2843                .DeE--------------------------------------R.....................................................................  t0 = sp + 0x58
2844                .D========================eE--------------R.....................................................................  a1 = a1 << 0x3
2845                ..D========================eE-------------R.....................................................................  a1 = a1 + a2
2846                ..D=========================eeeeeeeeeeeeeeeeeeeeeeeeeER.........................................................  a2 = u8 [a1]
2847                ..D=========================eeeeeeeeeeeeeeeeeeeeeeeeeER.........................................................  a5 = u8 [a1 + 0x1]
2848                ..D=========================eeeeeeeeeeeeeeeeeeeeeeeeeER.........................................................  s1 = u8 [a1 + 0x2]
2849                ...D========================eeeeeeeeeeeeeeeeeeeeeeeeeER.........................................................  a4 = u8 [a1 + 0x3]
2850                ...D=======================eE-------------------------R.........................................................  a3 = a3 + t0
2851                ...D=================================================eER........................................................  a5 = a5 << 0x8
2852                ...D=================================================eER........................................................  s1 = s1 << 0x10
2853                ....D================================================eER........................................................  a4 = a4 << 0x18
2854                ....D=================================================eER.......................................................  a2 = a2 | a5
2855                ....D================================================eeeeeeeeeeeeeeeeeeeeeeeeeER................................  a5 = u8 [a1 + 0x4]
2856                ....D=================================================eeeeeeeeeeeeeeeeeeeeeeeeeER...............................  a0 = u8 [a1 + 0x5]
2857                .....D================================================eE------------------------R...............................  a4 = a4 | s1
2858                .....D=================================================eeeeeeeeeeeeeeeeeeeeeeeeeER..............................  s1 = u8 [a1 + 0x6]
2859                .....D=================================================eeeeeeeeeeeeeeeeeeeeeeeeeER..............................  a1 = u8 [a1 + 0x7]
2860                .....D=========================================================================eER..............................  a0 = a0 << 0x8
2861                ......D=========================================================================eER.............................  a0 = a0 | a5
2862                ......D=========================================================================eER.............................  s1 = s1 << 0x10
2863                ......D=========================================================================eER.............................  a1 = a1 << 0x18
2864                ......D==========================================================================eER............................  a1 = a1 | s1
2865                .......D======================================================================eE---R............................  a2 = a2 | a4
2866                .......D==========================================================================eER...........................  a0 = a0 | a1
2867                .......D==================eE--------------------------------------------------------R...........................  a1 = s0 - t1
2868                ........D==========================================================================eER..........................  a0 = a0 << 0x20
2869                ........D===========================================================================eER.........................  a0 = a0 | a2
2870                ...........................................D=========================================eeeeeeeeeeeeeeeeeeeeeeeeeER  u64 [sp + 0x58] = a0
2871                ...........................................D===================================eeeeeeeeeeeeeeeeeeeeeeeeeE------R  a0 = u8 [a3]
2872                ...........................................D=====================================eeeeeeeeeeeeeeeeeeeeeeeeeE----R  a1 = u8 [a1 + 0x4]
2873                ...........................................D==============================================================eeeE-R  a0 = a1 * a0
2874                ............................................D====================================eeeeeeeeeeeeeeeeeeeeeeeeeE----R  a1 = u8 [s0 + 0x23]
2875                ............................................D=============================================================eE---R  jump 0 if a1 != 0
2876            ",
2877        );
2878    }
2879
// Pointer-chasing test under CacheModel::L3Hit: each load's address
// depends on the previous load's result, so nothing overlaps — the
// 37-cycle stall runs ('=') stack up serially, demonstrating the full
// per-load L3 latency on a dependent chain. 'ret' executes early but
// waits ('-') for all loads to retire in order.
2880    #[test]
2881    fn test_l3_loads() {
2882        assert_timeline(CacheModel::L3Hit,
2883            "
2884                a0 = u64 [a0]
2885                a0 = u64 [a0]
2886                a0 = u64 [a0]
2887                a0 = u64 [a0]
2888                ret
2889            ",
2890            "
2891                DeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeER...............................................................................................................  a0 = u64 [a0]
2892                D=====================================eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeER..........................................................................  a0 = u64 [a0]
2893                D==========================================================================eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeER.....................................  a0 = u64 [a0]
2894                D===============================================================================================================eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeER  a0 = u64 [a0]
2895                .DeeeeeeeeeeeeeeeeeeeeeeE-----------------------------------------------------------------------------------------------------------------------------R  ret
2896            ",
2897        )
2898    }
2899
// Host-call cost test: 'ecalli' is charged a large fixed execution cost
// (~100 cycles in this timeline), and the following 'ret' finishes early
// but cannot retire until the ecall does.
2900    #[test]
2901    fn test_ecalli() {
2902        assert_timeline(
2903            test_config(),
2904            "
2905                ecalli 27
2906                ret
2907            ",
2908            "
2909                DeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeER  ecalli 27
2910                .DeeeeeeeeeeeeeeeeeeeeeeE-----------------------------------------------------------------------------R  ret
2911            ",
2912        );
2913    }
2914
// Minimal dependent ALU chain: the variable shift waits one cycle ('=')
// for the xor result feeding its shift amount. Also covers an
// all-ones (sign-extended -1) immediate in the expected disassembly.
2915    #[test]
2916    fn test_xor_and_shift() {
2917        assert_timeline(
2918            test_config(),
2919            "
2920                a1 = a1 ^ 0xffffffffffffffff
2921                a1 = a0 >> a1
2922                fallthrough
2923            ",
2924            "
2925                DeER..  a1 = a1 ^ 0xffffffffffffffff
2926                D=eER.  a1 = a0 >> a1
2927                .DeeER  fallthrough
2928            ",
2929        )
2930    }
2931
// Eliminated moves still consume front-end bandwidth: four reg-to-reg
// moves (each 'D.....' — decoded, never executed; see test_register_move)
// fill all MAX_DECODE_PER_CYCLE = 4 decode slots of cycle 0, pushing the
// trap's decode to cycle 1.
2932    #[test]
2933    fn test_move_reg_decode_slots() {
2934        assert_timeline(
2935            test_config(),
2936            "
2937                s0 = a1
2938                a0 = a1
2939                a1 = t0
2940                a2 = s1
2941                trap
2942            ",
2943            "
2944                D.....  s0 = a1
2945                D.....  a0 = a1
2946                D.....  a1 = t0
2947                D.....  a2 = s1
2948                .DeeER  trap
2949            ",
2950        )
2951    }
2952}