polkavm_common/
simulator.rs

1#![allow(clippy::undocumented_unsafe_blocks)]
2#![allow(unsafe_code)]
3
4use crate::cast::cast;
5use crate::program::{InstructionFormat, InstructionSet, InstructionSetKind, Opcode, ParsingVisitor, RawReg, UNUSED_RAW_OPCODE};
6use crate::utils::{Bitness, BitnessT, GasVisitorT, B64};
7use alloc::string::String;
8use alloc::vec;
9
10#[cfg(feature = "simd")]
11use picosimd::amd64::{
12    avx2::i8x32,
13    avx2_composite::{i16x32, i32x32},
14    sse::i8x16,
15};
16
17#[cfg(not(feature = "simd"))]
18use picosimd::fallback::{i16x32, i32x32, i8x16, i8x32};
19
// Wraps a run of tokens so that the same call sites compile in both build configurations.
//
// Without the `simd` feature on x86_64 the tokens are emitted unchanged.
#[cfg(not(all(feature = "simd", target_arch = "x86_64")))]
macro_rules! unsafe_avx2 {
    ($($t:tt)*) => { $($t)* }
}

// With the `simd` feature on x86_64 the tokens are wrapped in an `unsafe` block. Under this
// same cfg the `*_avx2` methods below carry `#[target_feature(enable = "avx2")]`.
//
// NOTE(review): nothing visible here checks that AVX2 is actually available at runtime;
// presumably the caller or the build configuration guarantees it - confirm.
// NOTE(review): the `picosimd` imports above are gated on `feature = "simd"` alone, while
// this macro also checks `target_arch`; confirm `simd` on a non-x86_64 target is rejected
// elsewhere.
#[cfg(all(feature = "simd", target_arch = "x86_64"))]
macro_rules! unsafe_avx2 {
    ($($t:tt)*) => { unsafe { $($t)* } }
}
29
/// Describes how expensive memory accesses are for the simulator.
#[derive(Copy, Clone, Debug, Hash)]
pub struct CacheModel {
    /// The cost charged for a memory access.
    pub memory_access_cost: i8,
}

#[allow(non_upper_case_globals)]
impl CacheModel {
    /// Preset with a memory access cost of 4.
    pub const L1Hit: Self = Self { memory_access_cost: 4 };
    /// Preset with a memory access cost of 25.
    pub const L2Hit: Self = Self { memory_access_cost: 25 };
    /// Preset with a memory access cost of 37.
    pub const L3Hit: Self = Self { memory_access_cost: 37 };
}
41
#[cfg(feature = "arbitrary")]
impl arbitrary::Arbitrary<'_> for CacheModel {
    /// Generates a model whose `memory_access_cost` is always within `1..=127`.
    fn arbitrary(u: &mut arbitrary::Unstructured) -> arbitrary::Result<Self> {
        let raw = <i8 as arbitrary::Arbitrary>::arbitrary(u)?;

        // `i8::MIN.abs()` overflows, which panics when overflow checks are enabled.
        // `wrapping_abs` leaves `i8::MIN` unchanged, so `max(1)` turns it into 1, which is
        // exactly what the plain `abs()` produced in builds without overflow checks.
        let memory_access_cost = raw.wrapping_abs().max(1);

        Ok(Self { memory_access_cost })
    }
}
50
/// The maximum number of instruction slots available per cycle.
///
/// The per-cycle decode budget is reset to this value at the end of every simulated cycle.
const MAX_DECODE_PER_CYCLE: u32 = 4;

/// The maximum number of instructions in-flight.
///
/// This is the number of entries in the reorder buffer; the per-entry state is kept in
/// 32-lane vectors, one lane per entry.
const REORDER_BUFFER_SIZE: usize = 32;

/// The maximum number of cycles refunded at the end of each basic block.
const GAS_COST_SLACK: i32 = 3;
59
/// The cost of a single instruction as seen by the simulator.
#[derive(Copy, Clone, Debug)]
pub struct InstCost {
    /// Initial value of the instruction's remaining-cycles counter when it is placed into
    /// the reorder buffer. Must not be negative (checked with a `debug_assert!` on dispatch).
    pub latency: i8,
    /// Number of decode slots the instruction needs in the cycle in which it is dispatched.
    pub decode_slots: u32,
    /// Number of ALU slots occupied while executing; at most `MAX_ALU_SLOTS`.
    pub alu_slots: u32,
    /// Number of multiplier slots occupied while executing; at most `MAX_MUL_SLOTS`.
    pub mul_slots: u32,
    /// Number of divider slots occupied while executing; at most `MAX_DIV_SLOTS`.
    pub div_slots: u32,
    /// Number of load slots occupied while executing; at most `MAX_LOAD_SLOTS`.
    pub load_slots: u32,
    /// Number of store slots occupied while executing; at most `MAX_STORE_SLOTS`.
    pub store_slots: u32,
}
70
// Per-resource capacities. These are the amounts available when the simulator is reset,
// and also the upper bounds `InstCost::resources` asserts on the matching `InstCost` fields.
const MAX_ALU_SLOTS: u32 = 4;
const MAX_LOAD_SLOTS: u32 = 4;
const MAX_STORE_SLOTS: u32 = 4;
const MAX_MUL_SLOTS: u32 = 1;
const MAX_DIV_SLOTS: u32 = 1;
76
/// Returns the number of bits needed to represent every integer in `0..=value`.
///
/// For example `bits_needed(0) == 0`, `bits_needed(1) == 1`, `bits_needed(4) == 3` and
/// `bits_needed(u32::MAX) == 32`.
const fn bits_needed(value: u32) -> u32 {
    // Index of the highest set bit plus one. This gives the same answer as
    // `(value + 1).next_power_of_two().ilog2()` wherever that expression is defined, but it
    // cannot overflow for `value >= 2^31`.
    u32::BITS - value.leading_zeros()
}
80
// Width in bits of each resource counter inside the packed resources word: just wide
// enough to hold any value in `0..=MAX_*_SLOTS`.
const ALU_BITS: u32 = bits_needed(MAX_ALU_SLOTS);
const LOAD_BITS: u32 = bits_needed(MAX_LOAD_SLOTS);
const STORE_BITS: u32 = bits_needed(MAX_STORE_SLOTS);
const MUL_BITS: u32 = bits_needed(MAX_MUL_SLOTS);
const DIV_BITS: u32 = bits_needed(MAX_DIV_SLOTS);
86
87#[allow(clippy::int_plus_one)]
88const _: () = {
89    assert!((1 << ALU_BITS) - 1 >= MAX_ALU_SLOTS);
90    assert!((1 << LOAD_BITS) - 1 >= MAX_LOAD_SLOTS);
91    assert!((1 << STORE_BITS) - 1 >= MAX_STORE_SLOTS);
92    assert!((1 << MUL_BITS) - 1 >= MAX_MUL_SLOTS);
93    assert!((1 << DIV_BITS) - 1 >= MAX_DIV_SLOTS);
94};
95
// Bit offset of each resource counter inside the packed resources word. Each counter is
// followed by one extra bit (the `+ 1`), which is the bit selected by
// `RESOURCES_UNDERFLOW_MASK` below.
const ALU_OFFSET: u32 = 0;
const LOAD_OFFSET: u32 = ALU_OFFSET + ALU_BITS + 1;
const STORE_OFFSET: u32 = LOAD_OFFSET + LOAD_BITS + 1;
const MUL_OFFSET: u32 = STORE_OFFSET + STORE_BITS + 1;
const DIV_OFFSET: u32 = MUL_OFFSET + MUL_BITS + 1;

// One guard bit directly above each counter. These bits are kept set in the word that tracks
// the available resources; after subtracting an instruction's requirements, a guard bit that
// is no longer set means the counter below it did not have enough left.
const RESOURCES_UNDERFLOW_MASK: u32 = (1 << (ALU_BITS + ALU_OFFSET))
    | (1 << (LOAD_BITS + LOAD_OFFSET))
    | (1 << (STORE_BITS + STORE_OFFSET))
    | (1 << (MUL_BITS + MUL_OFFSET))
    | (1 << (DIV_BITS + DIV_OFFSET));
107
/// Test-only logging helper which pretty-prints a packed resources word.
#[cfg(all(test, feature = "logging"))]
struct DebugResources(u32);

#[cfg(all(test, feature = "logging"))]
impl core::fmt::Debug for DebugResources {
    /// Prints every resource counter as a separate field of a `Resources` struct.
    fn fmt(&self, fmt: &mut core::fmt::Formatter) -> core::fmt::Result {
        let packed = self.0;
        // Extracts the `bits`-wide counter which starts at bit `offset`.
        let unpack = |offset: u32, bits: u32| (packed >> offset) & ((1 << bits) - 1);

        let mut output = fmt.debug_struct("Resources");
        output.field("alu", &unpack(ALU_OFFSET, ALU_BITS));
        output.field("load", &unpack(LOAD_OFFSET, LOAD_BITS));
        output.field("store", &unpack(STORE_OFFSET, STORE_BITS));
        output.field("mul", &unpack(MUL_OFFSET, MUL_BITS));
        output.field("div", &unpack(DIV_OFFSET, DIV_BITS));
        output.finish()
    }
}
123
/// Test-only logging helper which pretty-prints per-slot dependency bitmasks.
#[cfg(all(test, feature = "logging"))]
struct DebugDeps([i32; 32]);

#[cfg(all(test, feature = "logging"))]
impl core::fmt::Debug for DebugDeps {
    /// Prints `{slot={bit,bit,...}, ...}`, listing only the slots with a non-zero bitmask.
    fn fmt(&self, fmt: &mut core::fmt::Formatter) -> core::fmt::Result {
        fmt.write_str("{")?;

        let mut entries = self.0.into_iter().enumerate().filter(|(_, deps)| *deps != 0).peekable();
        while let Some((nth, mut deps)) = entries.next() {
            write!(fmt, "{nth}={{")?;

            // `deps` is never zero here, so there is always at least one bit to print.
            loop {
                let slot = deps.trailing_zeros();
                deps &= !(1 << slot);
                write!(fmt, "{slot}")?;
                if deps == 0 {
                    break;
                }
                fmt.write_str(",")?;
            }

            fmt.write_str("}")?;
            if entries.peek().is_some() {
                fmt.write_str(", ")?;
            }
        }

        fmt.write_str("}")
    }
}
154
/// Test-only logging helper which pretty-prints a 32-lane byte mask.
#[cfg(all(test, feature = "logging"))]
struct DebugMask([i8; 32]);

#[cfg(all(test, feature = "logging"))]
impl core::fmt::Debug for DebugMask {
    /// Prints the indices of all non-zero lanes.
    ///
    /// A lane equal to `-1` is printed as just its index; any other non-zero lane is printed
    /// as `index={value}`.
    fn fmt(&self, fmt: &mut core::fmt::Formatter) -> core::fmt::Result {
        fmt.write_str("{")?;

        // Zero lanes are filtered out up front, so the loop body never sees them.
        let mut entries = self.0.into_iter().enumerate().filter(|(_, mask)| *mask != 0).peekable();
        while let Some((nth, mask)) = entries.next() {
            if mask == -1 {
                write!(fmt, "{nth}")?;
            } else {
                write!(fmt, "{nth}={{{mask}}}")?;
            }

            if entries.peek().is_some() {
                fmt.write_str(", ")?;
            }
        }

        fmt.write_str("}")
    }
}
183
/// Test-only logging helper which pretty-prints the register -> reorder buffer entry table.
#[cfg(all(test, feature = "logging"))]
struct DebugEntryByRegister([i8; 16]);

#[cfg(all(test, feature = "logging"))]
impl core::fmt::Debug for DebugEntryByRegister {
    /// Prints `{reg=entry, ...}`, skipping every register whose entry is `-1`.
    fn fmt(&self, fmt: &mut core::fmt::Formatter) -> core::fmt::Result {
        fmt.write_str("{")?;

        // Counted over all lanes of the table, independently of how many registers
        // `Reg::ALL` yields below.
        let mut remaining = self.0.iter().filter(|&&entry| entry != -1).count();

        let occupied = crate::program::Reg::ALL
            .into_iter()
            .zip(self.0.into_iter())
            .filter(|&(_, entry)| entry != -1);

        for (reg, entry) in occupied {
            write!(fmt, "{reg}={entry}")?;
            remaining -= 1;
            if remaining != 0 {
                fmt.write_str(", ")?;
            }
        }

        fmt.write_str("}")
    }
}
208
/// Test-only logging helper which pretty-prints the per-slot remaining cycle counters.
#[cfg(all(test, feature = "logging"))]
struct DebugCyclesRemaining([i8; 32]);

#[cfg(all(test, feature = "logging"))]
impl core::fmt::Debug for DebugCyclesRemaining {
    /// Prints `{0=count, 1=count, ...}` for every lane, including the zero ones.
    fn fmt(&self, fmt: &mut core::fmt::Formatter) -> core::fmt::Result {
        fmt.write_str("{")?;

        for (index, count) in self.0.into_iter().enumerate() {
            // The separator goes in front of every entry except the very first one.
            if index != 0 {
                fmt.write_str(", ")?;
            }
            write!(fmt, "{index}={count}")?;
        }

        fmt.write_str("}")
    }
}
229
/// Test-only logging helper which pretty-prints the per-slot state of the reorder buffer.
#[cfg(all(test, feature = "logging"))]
struct DebugState([i8; 32]);

#[cfg(all(test, feature = "logging"))]
impl core::fmt::Debug for DebugState {
    /// Prints `{slot=state, ...}` for every slot with a non-zero state.
    ///
    /// States 1 to 4 are printed as `D`, `w`, `e` and `X`; any other value is printed as
    /// a plain number.
    fn fmt(&self, fmt: &mut core::fmt::Formatter) -> core::fmt::Result {
        fmt.write_str("{")?;

        let mut entries = self.0.into_iter().enumerate().filter(|(_, state)| *state != 0).peekable();
        while let Some((nth, state)) = entries.next() {
            let symbol = match state {
                1 => Some('D'),
                2 => Some('w'),
                3 => Some('e'),
                4 => Some('X'),
                _ => None,
            };

            if let Some(state) = symbol {
                write!(fmt, "{nth}={state}")?;
            } else {
                write!(fmt, "{nth}={state}")?;
            }

            if entries.peek().is_some() {
                fmt.write_str(", ")?;
            }
        }

        fmt.write_str("}")
    }
}
265
266impl InstCost {
267    #[inline(always)]
268    const fn resources(&self) -> u32 {
269        assert!(self.alu_slots <= MAX_ALU_SLOTS);
270        assert!(self.mul_slots <= MAX_MUL_SLOTS);
271        assert!(self.div_slots <= MAX_DIV_SLOTS);
272        assert!(self.load_slots <= MAX_LOAD_SLOTS);
273        assert!(self.store_slots <= MAX_STORE_SLOTS);
274
275        (self.alu_slots << ALU_OFFSET)
276            | (self.load_slots << LOAD_OFFSET)
277            | (self.store_slots << STORE_OFFSET)
278            | (self.mul_slots << MUL_OFFSET)
279            | (self.div_slots << DIV_OFFSET)
280    }
281}
282
/// A cost with zero latency which takes a single decode slot and no execution resources.
///
/// Also used as the base for struct-update syntax when only some fields need to be set.
const EMPTY_COST: InstCost = InstCost {
    latency: 0,
    decode_slots: 1,
    alu_slots: 0,
    mul_slots: 0,
    div_slots: 0,
    load_slots: 0,
    store_slots: 0,
};
292
/// The kinds of pipeline events reported through `Tracer::on_event`.
#[derive(Copy, Clone, Debug)]
pub enum EventKind {
    /// The instruction was handed to the simulator for dispatch.
    Decode,
    /// The instruction is still in the waiting state after this cycle's start phase.
    WaitingForDependencies,
    /// The instruction is executing and has not reached its last cycle yet.
    Executing,
    /// The instruction has reached its last cycle of execution.
    Executed,
    /// The instruction has executed, but was not retired this cycle.
    WaitingForRetirement,
    /// The instruction was retired this cycle.
    Retired,
}

/// Maps every event kind to a single-character symbol.
impl From<EventKind> for char {
    fn from(kind: EventKind) -> char {
        use EventKind::{Decode, Executed, Executing, Retired, WaitingForDependencies, WaitingForRetirement};

        match kind {
            Decode => 'D',
            WaitingForDependencies => '=',
            Executing => 'e',
            Executed => 'E',
            WaitingForRetirement => '-',
            Retired => 'R',
        }
    }
}
315
/// A hook through which the simulator reports pipeline events.
pub trait Tracer: Sized {
    /// A flag to make it easier for the optimizer to get rid of dead code.
    ///
    /// When this is `false` the simulator never calls [`Tracer::on_event`].
    const SHOULD_CALL_ON_EVENT: bool;

    /// Whether the simulator may advance by more than one cycle per tick while it is
    /// draining the reorder buffer. Queried before every such tick; defaults to `true`.
    fn should_enable_fast_forward(&self) -> bool {
        true
    }

    /// Called for every reported event with the simulator's current cycle counter, the
    /// index of the instruction the event is about, and the kind of the event.
    ///
    /// The default implementation does nothing.
    fn on_event(&mut self, _cycle: u32, _instruction: u32, _event: EventKind) {}
}

/// The no-op tracer: event reporting is disabled and fast forwarding is left enabled.
impl Tracer for () {
    const SHOULD_CALL_ON_EVENT: bool = false;
}
330
/// A cycle-level pipeline simulator built around a 32-entry reorder buffer.
///
/// `B` selects the bitness and `T` is the tracer which receives pipeline events.
pub struct Simulator<'a, B, T: Tracer = ()> {
    /// The bytecode of the whole program.
    code: &'a [u8],
    /// The current cycle on which we're on.
    cycles: u32,
    /// The current instruction on which we're at when feeding code into the simulator.
    instructions: u32,
    /// Have we finished the simulation?
    finished: bool,
    /// Number of decode slots still available during this cycle.
    decode_slots_remaining_this_cycle: u32,
    /// Number of currently available resource, packed into a single field.
    ///
    /// The bits of `RESOURCES_UNDERFLOW_MASK` are kept set in this field.
    resources_available: u32,
    /// The number of instructions currently in the reorder buffer.
    instructions_in_flight: u32,
    /// The offset of the first instruction in the reorder buffer (which is a circular buffer).
    reorder_buffer_head: u32,
    /// The next slot in the reorder buffer (which is a circular buffer).
    reorder_buffer_tail: u32,
    /// Which exact instruction does the reorder buffer contain at a given position?
    /// Used only when emitting events.
    rob_instruction: [u32; REORDER_BUFFER_SIZE],
    /// The state of each entry in the reorder buffer.
    ///
    /// 0 = empty, 1 = decoding, 2 = waiting, 3 = executing, 4 = executed.
    rob_state: i8x32,
    /// The number of cycles remaining for each instruction in the reorder buffer.
    rob_cycles_remaining: i8x32,
    /// The resources required to start execution for each instruction in the reorder buffer.
    rob_required_resources: i16x32,
    /// A bitmask which contains each instruction's dependencies.
    rob_dependencies: i32x32,
    /// A bitmask which contains each instruction's reverse dependencies.
    rob_depended_by: i32x32,
    /// A bitmask of all of the registers which a given instruction in the reorder buffer has written into.
    registers_written_by_rob_entry: i16x32,
    /// The index of the reorder buffer entry which has last written into a given register.
    ///
    /// A value of -1 means that there is no such entry.
    rob_entry_by_register: i8x16,
    /// The cache model used for memory accesses.
    cache_model: CacheModel,
    /// When set this overrides the branch costs to be always either cheap (== branch hit) or expensive (== branch miss).
    force_branch_is_cheap: Option<bool>,

    /// The raw opcode of `trap` in the selected instruction set, or `UNUSED_RAW_OPCODE` if it has none.
    opcode_trap: u8,
    /// The raw opcode of `unlikely` in the selected instruction set, or `UNUSED_RAW_OPCODE` if it has none.
    opcode_unlikely: u8,

    /// The tracer which receives pipeline events.
    tracer: T,
    _phantom: core::marker::PhantomData<B>,
}
378
379impl<'a, B, T> Simulator<'a, B, T>
380where
381    T: Tracer,
382    B: BitnessT,
383{
384    pub fn new(code: &'a [u8], isa: InstructionSetKind, cache_model: CacheModel, tracer: T) -> Self {
385        unsafe_avx2! {
386            let mut simulator = Simulator {
387                code,
388                rob_instruction: [0; REORDER_BUFFER_SIZE],
389                cycles: 0,
390                instructions: 0,
391                finished: false,
392                decode_slots_remaining_this_cycle: 0,
393                resources_available: 0,
394                rob_state: i8x32::zero(),
395                rob_cycles_remaining: i8x32::zero(),
396                rob_required_resources: i16x32::zero(),
397                rob_dependencies: i32x32::zero(),
398                rob_depended_by: i32x32::zero(),
399                registers_written_by_rob_entry: i16x32::zero(),
400                rob_entry_by_register: i8x16::zero(),
401                reorder_buffer_tail: 0,
402                cache_model,
403                tracer,
404                force_branch_is_cheap: None,
405                instructions_in_flight: 0,
406                reorder_buffer_head: 0,
407                opcode_trap: isa.opcode_to_u8(Opcode::trap).unwrap_or(UNUSED_RAW_OPCODE),
408                opcode_unlikely: isa.opcode_to_u8(Opcode::unlikely).unwrap_or(UNUSED_RAW_OPCODE),
409                _phantom: core::marker::PhantomData,
410            };
411
412            simulator.clear();
413            simulator
414        }
415    }
416
    /// Sets the branch cost override.
    ///
    /// `Some(true)` forces branches to always be cheap, `Some(false)` forces them to always
    /// be expensive, and `None` removes the override.
    pub fn set_force_branch_is_cheap(&mut self, value: Option<bool>) {
        self.force_branch_is_cheap = value;
    }
420
421    fn clear(&mut self) {
422        self.cycles = 0;
423        self.instructions = 0;
424        self.finished = false;
425        self.instructions_in_flight = 0;
426        self.decode_slots_remaining_this_cycle = MAX_DECODE_PER_CYCLE;
427        self.resources_available = InstCost {
428            alu_slots: MAX_ALU_SLOTS,
429            mul_slots: MAX_MUL_SLOTS,
430            div_slots: MAX_DIV_SLOTS,
431            load_slots: MAX_LOAD_SLOTS,
432            store_slots: MAX_STORE_SLOTS,
433            ..EMPTY_COST
434        }
435        .resources()
436            | RESOURCES_UNDERFLOW_MASK;
437
438        self.reorder_buffer_tail = 0;
439        self.reorder_buffer_head = 0;
440
441        unsafe_avx2! {
442            self.rob_entry_by_register = i8x16::negative_one();
443            self.rob_state = i8x32::zero();
444            self.rob_cycles_remaining = i8x32::zero();
445            self.rob_required_resources = i16x32::zero();
446            self.rob_dependencies = i32x32::zero();
447            self.rob_depended_by = i32x32::zero();
448            self.registers_written_by_rob_entry = i16x32::zero();
449        }
450
451        if T::SHOULD_CALL_ON_EVENT {
452            self.rob_instruction.fill(0);
453        }
454    }
455
456    fn emit_event(&mut self, slot: u32, kind: EventKind) {
457        if T::SHOULD_CALL_ON_EVENT {
458            self.tracer.on_event(self.cycles, self.rob_instruction[cast(slot).to_usize()], kind);
459        }
460    }
461
    /// Advances the simulation by forwarding to `tick_cycle_avx2`.
    ///
    /// The `unsafe_avx2!` macro supplies the `unsafe` block in builds where the callee
    /// is compiled with the `avx2` target feature enabled.
    fn tick_cycle<const FAST_FORWARD: bool>(&mut self) {
        unsafe_avx2! {
            self.tick_cycle_avx2::<FAST_FORWARD>();
        }
    }
467
468    #[cfg_attr(all(feature = "simd", target_arch = "x86_64"), target_feature(enable = "avx2"))]
469    #[inline(never)]
470    fn emit_events_avx2(&mut self, mask: i8x32, event_kind: EventKind) {
471        if !T::SHOULD_CALL_ON_EVENT {
472            return;
473        }
474
475        let mut bits = mask.most_significant_bits();
476        while bits != 0 {
477            let slot = bits.trailing_zeros();
478            self.emit_event(slot, event_kind);
479            bits &= !(1 << slot);
480        }
481    }
482
    /// Returns the number of instructions currently in the reorder buffer.
    fn instructions_in_flight(&self) -> u32 {
        self.instructions_in_flight
    }
486
    /// Advances the simulated pipeline by one cycle, or with `FAST_FORWARD` possibly by
    /// several cycles at once.
    ///
    /// Entry states are: 0 = empty, 1 = decoding, 2 = waiting, 3 = executing, 4 = executed.
    ///
    /// In order, this:
    /// 1. retires the contiguous run of executed entries at the head of the reorder buffer,
    /// 2. starts executing up to five waiting entries whose dependencies are satisfied and
    ///    whose resources are available, in buffer order starting at the head,
    /// 3. counts down the executing entries, releasing their resources and register writes
    ///    when the counter reaches zero and marking them executed when it drops to -1,
    /// 4. moves decoding entries to the waiting state, and
    /// 5. resets the per-cycle decode budget and advances the cycle counter.
    ///
    /// In test builds this panics if a call changes none of the tracked state.
    #[cfg_attr(all(feature = "simd", target_arch = "x86_64"), target_feature(enable = "avx2"))]
    fn tick_cycle_avx2<const FAST_FORWARD: bool>(&mut self) {
        let state_decoding = i8x32::splat(1);
        let state_waiting = i8x32::splat(2);
        let state_executing = i8x32::splat(3);
        let state_executed = i8x32::splat(4);

        // Snapshot the state so that the "made no progress" check at the end can compare against it.
        #[cfg(test)]
        let original_state = self.rob_state;
        #[cfg(test)]
        let original_cycles_remaining = self.rob_cycles_remaining;
        #[cfg(test)]
        let original_dependencies = self.rob_dependencies;
        #[cfg(test)]
        let original_depended_by = self.rob_depended_by;
        #[cfg(test)]
        let original_entry_by_register = self.rob_entry_by_register;
        #[cfg(test)]
        let original_required_resources = self.rob_required_resources;
        #[cfg(test)]
        let original_decode_slots = self.decode_slots_remaining_this_cycle;
        #[cfg(test)]
        let original_reorder_buffer_head = self.reorder_buffer_head;
        #[cfg(test)]
        let original_resources_available = self.resources_available;
        #[cfg(test)]
        let original_instructions_in_flight = self.instructions_in_flight;

        #[cfg(all(test, feature = "logging"))]
        log::debug!(
            "tick_cycle_avx2[{}]: state={:?}",
            self.cycles,
            DebugState(self.rob_state.to_array())
        );
        #[cfg(all(test, feature = "logging"))]
        log::debug!(
            "tick_cycle_avx2[{}]: cycles={:?}",
            self.cycles,
            DebugCyclesRemaining(self.rob_cycles_remaining.to_array())
        );
        #[cfg(all(test, feature = "logging"))]
        log::debug!(
            "tick_cycle_avx2[{}]: dependencies={:?}",
            self.cycles,
            DebugDeps(self.rob_dependencies.to_array())
        );
        #[cfg(all(test, feature = "logging"))]
        log::debug!(
            "tick_cycle_avx2[{}]: depended_by={:?}",
            self.cycles,
            DebugDeps(self.rob_depended_by.to_array())
        );
        #[cfg(all(test, feature = "logging"))]
        log::debug!(
            "tick_cycle_avx2[{}]: entry_by_register={:?}",
            self.cycles,
            DebugEntryByRegister(self.rob_entry_by_register.to_array())
        );
        #[cfg(all(test, feature = "logging"))]
        log::debug!(
            "tick_cycle_avx2[{}]: resources_available={:?}",
            self.cycles,
            DebugResources(self.resources_available)
        );

        // Invariant: the number of entries with a non-zero state equals the in-flight counter.
        debug_assert_eq!(
            self.rob_state.simd_eq(i8x32::zero()).most_significant_bits().count_zeros(),
            self.instructions_in_flight
        );

        // Retire unneeded instructions.
        {
            let is_waiting_for_retirement: i8x32 = self.rob_state.simd_eq(state_executed);
            // Rotate so that bit 0 is the head of the buffer; the run of ones starting there
            // is the number of executed entries which can retire in order.
            let leading_count_to_retire = is_waiting_for_retirement
                .most_significant_bits()
                .rotate_right(self.reorder_buffer_head)
                .trailing_ones() as i32;

            // Build a mask with that many low bits set (shifting a `u64` so that a count of 32
            // is still a valid shift) and rotate it back to the actual buffer positions.
            let is_retired_this_cycle = i8x32::from_i1x32_sext(
                (cast(1_u64 << leading_count_to_retire).truncate_to_u32().wrapping_sub(1)).rotate_left(self.reorder_buffer_head) as i32,
            );

            // Mark every instruction which depended on instructions which just retired as not depending on them anymore.
            self.rob_dependencies = self
                .rob_dependencies
                .and_not(i32x32::splat(is_retired_this_cycle.most_significant_bits()));

            // Mark retired instructions as not depended by any other instruction.
            self.rob_depended_by = self.rob_depended_by.and_not(i32x32::from_i8x32_sext(is_retired_this_cycle));

            // Reset the state of retired instructions.
            self.rob_state = self.rob_state.and_not(is_retired_this_cycle);

            let retired_count = is_retired_this_cycle.most_significant_bits().count_ones();
            #[cfg(all(test, feature = "logging"))]
            if retired_count > 0 {
                log::debug!(
                    "tick_cycle_avx2[{}]: instructions_in_flight: {} -> {}",
                    self.cycles,
                    self.instructions_in_flight,
                    self.instructions_in_flight - retired_count
                );
            }

            self.instructions_in_flight -= retired_count;
            self.reorder_buffer_head = (self.reorder_buffer_head + retired_count) % (REORDER_BUFFER_SIZE as u32);

            self.emit_events_avx2(is_retired_this_cycle, EventKind::Retired);
            // Entries which have executed, but are stuck behind an older unfinished entry.
            self.emit_events_avx2(
                is_waiting_for_retirement.and_not(is_retired_this_cycle),
                EventKind::WaitingForRetirement,
            );

            debug_assert_eq!(
                self.rob_state.simd_eq(i8x32::zero()).most_significant_bits().count_zeros(),
                self.instructions_in_flight
            );
        }

        // Start execution. (waiting -> executing)
        {
            const RESOURCES_UNDERFLOW_MASK_I16: i16 = RESOURCES_UNDERFLOW_MASK as u16 as i16;
            // An entry counts as executed for dependency purposes once its counter is below 1.
            let is_executed: i8x32 = self.rob_cycles_remaining.simd_lt(i8x32::splat(1));
            let is_executed_mask: i32 = is_executed.most_significant_bits();
            // Entries whose every dependency has already executed.
            let has_no_dependencies: i8x32 = (self.rob_dependencies.and_not(i32x32::splat(is_executed_mask)))
                .simd_eq(i32x32::zero())
                .clamp_to_i8_range();

            let mut is_waiting_to_start: i8x32 = self.rob_state.simd_eq(state_waiting) & has_no_dependencies;

            // Each iteration starts at most one entry.
            // NOTE(review): the iteration count of 5 is an unnamed constant; presumably it is
            // the maximum number of instructions which can start per cycle - confirm.
            for _ in 0..5 {
                #[cfg(all(test, feature = "logging"))]
                if is_waiting_to_start.most_significant_bits() != 0 {
                    log::debug!(
                        "tick_cycle_avx2[{}]: is_waiting_to_start={:?}",
                        self.cycles,
                        DebugMask(is_waiting_to_start.to_array())
                    );
                }
                debug_assert_eq!(self.resources_available & RESOURCES_UNDERFLOW_MASK, RESOURCES_UNDERFLOW_MASK);

                // Subtract every entry's requirements from what's available. An entry fits
                // only if all of the guard bits are still set after the subtraction.
                let new_resources: i16x32 = i16x32::splat(self.resources_available as i16) - self.rob_required_resources;
                let have_enough_resources: i8x32 = (new_resources.and(i16x32::splat(RESOURCES_UNDERFLOW_MASK_I16)))
                    .simd_eq(i16x32::splat(RESOURCES_UNDERFLOW_MASK_I16))
                    .clamp_to_i8_range();
                let have_enough_resources = have_enough_resources.and(is_waiting_to_start);
                // Pick the first eligible entry counting from the head of the buffer.
                let mask = have_enough_resources.most_significant_bits().rotate_right(self.reorder_buffer_head);
                let position = mask.trailing_zeros();
                // `trailing_zeros` returns 32 when no bit is set, i.e. when nothing is eligible.
                if position != 32 {
                    let position = (position + self.reorder_buffer_head) % (REORDER_BUFFER_SIZE as u32);
                    #[cfg(all(test, feature = "logging"))]
                    log::debug!(
                        "tick_cycle_avx2[{}]: starting: instruction={}, slot={}",
                        self.cycles,
                        self.rob_instruction[cast(position).to_usize()],
                        position,
                    );

                    let resources_consumed = self.rob_required_resources.as_slice()[cast(position).to_usize()];
                    self.resources_available -= resources_consumed as u32;
                    // waiting (2) -> executing (3)
                    self.rob_state.as_slice_mut()[cast(position).to_usize()] += 1;
                    is_waiting_to_start.as_slice_mut()[cast(position).to_usize()] = 0;
                }
            }
            self.emit_events_avx2(self.rob_state.simd_eq(state_waiting), EventKind::WaitingForDependencies);
        }

        // Progress execution. (executing -> executing, executing -> executed)
        let mut cycle_count = 1;
        {
            let is_executing: i8x32 = self.rob_state.simd_eq(state_executing);
            if FAST_FORWARD {
                // The smallest remaining cycle count among the executing entries; the lanes
                // of all other entries are forced to 0xff so they don't affect the unsigned minimum.
                let max_cycles =
                    ((self.rob_cycles_remaining & is_executing) | (is_executing ^ i8x32::negative_one())).horizontal_min_unsigned();
                let max_cycles = cast(max_cycles).to_signed();

                #[cfg(all(test, feature = "logging"))]
                log::debug!("tick_cycle_avx2[{}]: max_cycles={}", self.cycles, max_cycles);
                // Only skip ahead if the decode budget for this cycle is still untouched.
                if max_cycles > 0 && self.decode_slots_remaining_this_cycle == MAX_DECODE_PER_CYCLE {
                    cycle_count = max_cycles;
                }
            }

            // NOTE(review): an entry whose counter is already 0 when it starts executing goes
            // straight to -1 here and so never matches the `== 0` check below; presumably such
            // entries hold no resources or register writes - confirm.
            self.rob_cycles_remaining = self.rob_cycles_remaining.saturating_sub(i8x32::splat(cycle_count) & is_executing);

            // Check which instructions just finished execution.
            let is_execution_finished: i8x32 = self.rob_cycles_remaining.simd_eq(i8x32::zero()) & is_executing;
            let is_execution_finished = is_execution_finished.to_i16x32_sext();

            #[cfg(all(test, feature = "logging"))]
            log::debug!(
                "tick_cycle_avx2[{}]: is_execution_finished={:?}",
                self.cycles,
                is_execution_finished
            );

            // Registers written by the entries which just finished no longer map to any entry (-1).
            let retired_register_writes: i16 = (self.registers_written_by_rob_entry & is_execution_finished).bitwise_reduce();
            self.registers_written_by_rob_entry = self.registers_written_by_rob_entry.and_not(is_execution_finished);
            self.rob_entry_by_register = self.rob_entry_by_register.or(i8x16::from_i1x16_sext(retired_register_writes));

            // Release any resources used.
            let resources_released = cast((self.rob_required_resources & is_execution_finished).wrapping_reduce()).to_unsigned();
            self.resources_available += u32::from(resources_released);
            self.rob_required_resources = self.rob_required_resources.and_not(is_execution_finished);

            // Executing entries whose counter has dropped to -1 become executed (3 -> 4).
            let is_last_cycle = self.rob_cycles_remaining.simd_eq(i8x32::negative_one());
            let has_cycles_remaining = self.rob_cycles_remaining.simd_gt(i8x32::negative_one());
            self.rob_state += i8x32::splat(1) & is_executing.and(is_last_cycle);
            self.emit_events_avx2(is_executing.and(is_last_cycle), EventKind::Executed);
            self.emit_events_avx2(is_executing.and(has_cycles_remaining), EventKind::Executing);
        }

        // Progress: decoding -> waiting
        {
            let is_decoding = self.rob_state.simd_eq(state_decoding);
            self.rob_state += i8x32::splat(1) & is_decoding;
        }

        self.decode_slots_remaining_this_cycle = MAX_DECODE_PER_CYCLE;
        self.cycles += cast(i32::from(cycle_count)).to_unsigned();

        #[cfg(all(test, feature = "logging"))]
        {
            if self.rob_state != original_state {
                log::debug!("tick_cycle_avx2[{}]: state changed!", self.cycles);
            } else {
                log::debug!("tick_cycle_avx2[{}]: state did NOT change!", self.cycles);
            }
        }

        // A tick which changes nothing would make the callers loop forever, so catch it in tests.
        // Counters are clamped to -1 from below before comparing them.
        #[cfg(test)]
        {
            assert!(
                self.instructions_in_flight != original_instructions_in_flight
                    || self.reorder_buffer_head != original_reorder_buffer_head
                    || self.decode_slots_remaining_this_cycle != original_decode_slots
                    || self.resources_available != original_resources_available
                    || self.rob_state != original_state
                    || self.rob_cycles_remaining.max_signed(i8x32::negative_one())
                        != original_cycles_remaining.max_signed(i8x32::negative_one())
                    || self.rob_dependencies != original_dependencies
                    || self.rob_depended_by != original_depended_by
                    || self.rob_entry_by_register != original_entry_by_register
                    || self.rob_required_resources != original_required_resources,
                "made no progress"
            );
        }
    }
734
735    #[inline(always)]
736    fn tick_cycle_if_cannot_decode(&mut self, decode_slots: u32) {
737        let mut should_tick =
738            self.decode_slots_remaining_this_cycle < decode_slots || self.instructions_in_flight() == (REORDER_BUFFER_SIZE as u32);
739        while should_tick {
740            self.tick_cycle::<false>();
741            should_tick = self.instructions_in_flight() == (REORDER_BUFFER_SIZE as u32);
742        }
743    }
744
745    #[inline(always)]
746    fn wait_until_empty(&mut self) {
747        #[cfg(all(test, feature = "logging"))]
748        if self.instructions_in_flight() > 0 {
749            log::debug!("wait_until_empty[{}]: starting fast forward!", self.cycles);
750        }
751
752        while self.instructions_in_flight() > 0 {
753            if self.tracer.should_enable_fast_forward() {
754                self.tick_cycle::<true>();
755            } else {
756                self.tick_cycle::<false>();
757            }
758        }
759    }
760
    /// Dispatches a single instruction into the simulator.
    ///
    /// `dst` is the register the instruction writes into (if any), `src1` and `src2` are
    /// the registers it reads (if any), and `cost` describes its latency and the slots it needs.
    ///
    /// In debug builds this asserts that `cost.latency` is not negative. The actual work is
    /// done by `dispatch_generic_avx2`.
    fn dispatch_generic(&mut self, dst: Option<RawReg>, src1: Option<RawReg>, src2: Option<RawReg>, cost: InstCost) {
        #[cfg(all(test, feature = "logging"))]
        log::debug!(
            "dispatch[{}]: instruction={:?}, dst={:?}, src=[{:?}, {:?}], slots={}, latency={}, alu={}, load={}, store={}, mul={}, div={}",
            self.cycles,
            self.instructions,
            dst.map(|reg| reg.get()),
            src1.map(|reg| reg.get()),
            src2.map(|reg| reg.get()),
            cost.decode_slots,
            cost.latency,
            cost.alu_slots,
            cost.load_slots,
            cost.store_slots,
            cost.mul_slots,
            cost.div_slots,
        );

        // The latency is used as the initial value of a signed per-entry cycle counter.
        debug_assert!(cost.latency >= 0);
        // The macro supplies the `unsafe` block when the callee is built with `avx2` enabled.
        unsafe_avx2! { self.dispatch_generic_avx2(dst, src1, src2, cost) }
    }
782
    /// Places one instruction into the reorder buffer.
    ///
    /// Claims the entry at the buffer's tail, records the instruction's latency and
    /// required resources there, links it to the entries currently recorded as the
    /// writers of `src1`/`src2`, and records it as the writer of `dst` (if any).
    /// Finally updates the decode-slot, in-flight and instruction counters.
    ///
    /// Built with the `avx2` target feature when the `simd` feature is enabled on x86_64.
    #[cfg_attr(all(feature = "simd", target_arch = "x86_64"), target_feature(enable = "avx2"))]
    fn dispatch_generic_avx2(&mut self, dst: Option<RawReg>, src1: Option<RawReg>, src2: Option<RawReg>, cost: InstCost) {
        let dst = dst.map(|dst| dst.get());
        let src1 = src1.map(|src1| src1.get());
        let src2 = src2.map(|src2| src2.get());

        // NOTE(review): presumably advances the simulation until `cost.decode_slots` decode
        // slots are available in the current cycle — confirm against the callee.
        self.tick_cycle_if_cannot_decode(cost.decode_slots);
        if T::SHOULD_CALL_ON_EVENT {
            self.tracer.on_event(self.cycles, self.instructions, EventKind::Decode);
        }

        // Claim the tail entry and advance the tail, wrapping around at `REORDER_BUFFER_SIZE`.
        let slot = self.reorder_buffer_tail;
        self.reorder_buffer_tail = (self.reorder_buffer_tail + 1) % (REORDER_BUFFER_SIZE as u32);
        // An otherwise all-zero byte vector with lane `slot` set to 0xff.
        let slot_mask = i8x32::zero().set_dynamic(cast(slot).truncate_to_u8(), cast(0xff_u8).to_signed());

        self.rob_cycles_remaining = self.rob_cycles_remaining.set_dynamic(slot as u8, cost.latency);
        self.rob_required_resources.as_slice_mut()[slot as usize] = cost.resources() as u16 as i16;

        // Look up which entry (if any) is recorded as the writer of each source register.
        // The stored value is sign-extended to `i32` and reinterpreted as `u32`, so a negative
        // value (such as the `-1` that `dispatch_move_reg_avx2` treats as "no entry") ends up
        // with its top bit set.
        let dependency_1: Option<u32> = src1
            .map(|src1| self.rob_entry_by_register.as_slice()[src1.to_usize()])
            .map(i32::from)
            .map(|x| cast(x).to_unsigned());
        let dependency_2: Option<u32> = src2
            .map(|src2| self.rob_entry_by_register.as_slice()[src2.to_usize()])
            .map(i32::from)
            .map(|x| cast(x).to_unsigned());
        match (dependency_1, dependency_2) {
            (Some(dependency_1), Some(dependency_2)) => {
                // `base` is 0 when the top bit is set (no writer entry) and 1 otherwise, so it
                // zeroes out both the dependency bit and the reverse edge for a missing writer.
                let base_1 = (dependency_1 >> 31) ^ 1;
                let base_2 = (dependency_2 >> 31) ^ 1;
                // Bit `n` of the mask is set when this slot depends on entry `n`. `wrapping_shl`
                // keeps the shift amount in range even for the "no entry" value.
                let dependencies_mask = cast(base_1.wrapping_shl(dependency_1) | base_2.wrapping_shl(dependency_2)).to_signed();
                self.rob_dependencies.as_slice_mut()[slot as usize] = dependencies_mask;
                // Reverse edges: mark `slot` in the entries it depends on. With `base == 0` the
                // index collapses to 0 and the value OR-ed in is 0, so nothing changes.
                self.rob_depended_by.as_slice_mut()[(dependency_1 * base_1) as usize] |= cast(base_1 << slot).to_signed();
                self.rob_depended_by.as_slice_mut()[(dependency_2 * base_2) as usize] |= cast(base_2 << slot).to_signed();
            }
            (Some(dependency), None) | (None, Some(dependency)) => {
                // Same scheme as above for a single source register.
                let base = (dependency >> 31) ^ 1;
                self.rob_dependencies.as_slice_mut()[slot as usize] = cast(base.wrapping_shl(dependency)).to_signed();
                self.rob_depended_by.as_slice_mut()[(dependency * base) as usize] |= cast(base.wrapping_shl(slot)).to_signed();
            }
            // No source registers: the dependency tables are not written for this slot.
            (None, None) => {}
        }

        if let Some(dst) = dst {
            // Every 16-bit lane holds only the bit that corresponds to `dst`.
            let dst_mask: i16x32 = i16x32::splat(cast(cast(1_u32 << dst.to_u32()).truncate_to_u16()).to_signed());
            // Drop the `dst` bit from every lane, then set it only in lane `slot`, where the
            // sign-extended `slot_mask` is all ones.
            // NOTE(review): assumes `a.and_not(b)` computes `a & !b` — confirm against picosimd.
            self.registers_written_by_rob_entry =
                self.registers_written_by_rob_entry.and_not(dst_mask) | (slot_mask.to_i16x32_sext() & dst_mask);
            // From now on `dst` is recorded as being written by this slot.
            self.rob_entry_by_register.as_slice_mut()[dst.to_usize()] = cast(cast(slot).truncate_to_u8()).to_signed();
        }

        // Set this entry's state to 1; the assertion at the bottom counts non-zero states.
        self.rob_state = self.rob_state.set_dynamic(slot as u8, 1);
        if T::SHOULD_CALL_ON_EVENT {
            // Remember which instruction occupies the slot; only written when the tracer
            // receives events.
            self.rob_instruction[cast(slot).to_usize()] = self.instructions;
        }

        self.instructions_in_flight += 1;
        self.decode_slots_remaining_this_cycle -= cost.decode_slots;
        self.instructions += 1;

        // Sanity check: the number of `rob_state` lanes that are not zero must match the
        // in-flight counter (assuming `most_significant_bits` yields one bit per lane in a
        // 32-bit mask).
        debug_assert_eq!(
            self.rob_state.simd_eq(i8x32::zero()).most_significant_bits().count_zeros(),
            self.instructions_in_flight
        );
    }
847
848    fn dispatch_move_reg_avx2(&mut self, dst: RawReg, src: RawReg) {
849        let dst = dst.get();
850        let src = src.get();
851
852        self.tick_cycle_if_cannot_decode(1);
853        if T::SHOULD_CALL_ON_EVENT {
854            self.tracer.on_event(self.cycles, self.instructions, EventKind::Decode);
855        }
856
857        let entry_by_register = self.rob_entry_by_register.as_slice_mut();
858        let registers_written_by_rob_entry = self.registers_written_by_rob_entry.as_slice_mut();
859        let old_slot = entry_by_register[dst.to_usize()];
860        if old_slot != -1 {
861            registers_written_by_rob_entry[old_slot as usize] &= !(1_i16 << dst.to_usize());
862        }
863
864        let new_slot = entry_by_register[src.to_usize()];
865        if new_slot != -1 {
866            registers_written_by_rob_entry[new_slot as usize] |= 1 << dst.to_usize();
867        }
868
869        entry_by_register[dst.to_usize()] = new_slot;
870        self.decode_slots_remaining_this_cycle -= 1;
871        self.instructions += 1;
872    }
873
    /// Dispatches an instruction with a destination register and two source registers.
    fn dispatch_3op(&mut self, dst: RawReg, src1: RawReg, src2: RawReg, cost: InstCost) {
        self.dispatch_generic(Some(dst), Some(src1), Some(src2), cost);
    }
877
    /// Dispatches an instruction with a destination register and a single source register.
    fn dispatch_2op(&mut self, dst: RawReg, src: RawReg, cost: InstCost) {
        self.dispatch_generic(Some(dst), Some(src), None, cost);
    }
881
    /// Dispatches an instruction that writes `dst` and reads no registers.
    fn dispatch_1op_dst(&mut self, dst: RawReg, cost: InstCost) {
        self.dispatch_generic(Some(dst), None, None, cost);
    }
885
886    fn dispatch_finish(&mut self, latency: i8) {
887        self.dispatch_generic(
888            None,
889            None,
890            None,
891            InstCost {
892                latency,
893                decode_slots: 1,
894                ..EMPTY_COST
895            },
896        );
897
898        self.wait_until_empty();
899        self.finished = true;
900    }
901
902    fn load_cost(&self) -> InstCost {
903        InstCost {
904            latency: self.cache_model.memory_access_cost,
905            decode_slots: 1,
906            alu_slots: 1,
907            load_slots: 1,
908            ..EMPTY_COST
909        }
910    }
911
912    fn dispatch_indirect_load(&mut self, dst: RawReg, base: RawReg, _offset: u32, _size: u32) {
913        self.dispatch_2op(dst, base, self.load_cost());
914    }
915
916    fn dispatch_load(&mut self, dst: RawReg, _offset: u32, _size: u32) {
917        self.dispatch_1op_dst(dst, self.load_cost());
918    }
919
    /// Returns the fixed cost of a store: latency 25, one decode slot, one ALU slot
    /// and one store slot.
    ///
    /// Unlike `load_cost` this does not consult the cache model, which is why `self`
    /// is unused.
    // NOTE(review): 25 matches `CacheModel::L2Hit`'s access cost; whether that is
    // intentional cannot be told from here.
    #[allow(clippy::unused_self)]
    fn store_cost(&self) -> InstCost {
        InstCost {
            latency: 25,
            decode_slots: 1,
            alu_slots: 1,
            store_slots: 1,
            ..EMPTY_COST
        }
    }
930
931    fn dispatch_store(&mut self, src: RawReg, _offset: u32, _size: u32) {
932        self.dispatch_generic(None, Some(src), None, self.store_cost());
933    }
934
935    fn dispatch_store_imm(&mut self, _offset: u32, _size: u32) {
936        self.dispatch_generic(None, None, None, self.store_cost());
937    }
938
939    fn dispatch_store_indirect(&mut self, src: RawReg, base: RawReg, _offset: u32, _size: u32) {
940        self.dispatch_generic(None, Some(src), Some(base), self.store_cost());
941    }
942
943    fn dispatch_store_imm_indirect(&mut self, base: RawReg, _offset: u32, _size: u32) {
944        self.dispatch_generic(None, Some(base), None, self.store_cost());
945    }
946
947    fn get_branch_cost(&self, offset: u32, args_length: u32, jump_offset: u32) -> i8 {
948        const BRANCH_PREDICTION_HIT_COST: i8 = 1;
949        const BRANCH_PREDICTION_MISS_COST: i8 = 20;
950
951        if let Some(is_hit) = self.force_branch_is_cheap {
952            return if is_hit {
953                BRANCH_PREDICTION_HIT_COST
954            } else {
955                BRANCH_PREDICTION_MISS_COST
956            };
957        }
958
959        if self
960            .code
961            .get(cast(offset).to_usize() + cast(args_length).to_usize())
962            .map(|&opcode| opcode == self.opcode_unlikely || opcode == self.opcode_trap)
963            .unwrap_or(true)
964        {
965            return BRANCH_PREDICTION_HIT_COST;
966        }
967
968        if self
969            .code
970            .get(cast(jump_offset).to_usize())
971            .map(|&opcode| opcode == self.opcode_unlikely || opcode == self.opcode_trap)
972            .unwrap_or(true)
973        {
974            return BRANCH_PREDICTION_HIT_COST;
975        }
976
977        BRANCH_PREDICTION_MISS_COST
978    }
979
980    fn dispatch_branch(&mut self, offset: u32, args_length: u32, s1: RawReg, s2: RawReg, jump_offset: u32) {
981        self.dispatch_generic(
982            None,
983            Some(s1),
984            Some(s2),
985            InstCost {
986                latency: self.get_branch_cost(offset, args_length, jump_offset),
987                decode_slots: 1,
988                alu_slots: 1,
989                ..EMPTY_COST
990            },
991        );
992        self.wait_until_empty();
993        self.finished = true;
994    }
995
996    fn dispatch_branch_imm(&mut self, offset: u32, args_length: u32, s: RawReg, jump_offset: u32) {
997        self.dispatch_generic(
998            None,
999            Some(s),
1000            None,
1001            InstCost {
1002                latency: self.get_branch_cost(offset, args_length, jump_offset),
1003                decode_slots: 1,
1004                alu_slots: 1,
1005                ..EMPTY_COST
1006            },
1007        );
1008        self.wait_until_empty();
1009        self.finished = true;
1010    }
1011
1012    fn dispatch_trivial_2op_1c(&mut self, d: RawReg, s: RawReg) {
1013        self.dispatch_2op(
1014            d,
1015            s,
1016            InstCost {
1017                latency: 1,
1018                decode_slots: 1,
1019                alu_slots: 1,
1020                ..EMPTY_COST
1021            },
1022        );
1023    }
1024
1025    fn dispatch_trivial_2op_2c(&mut self, d: RawReg, s: RawReg) {
1026        self.dispatch_2op(
1027            d,
1028            s,
1029            InstCost {
1030                latency: 2,
1031                decode_slots: 1,
1032                alu_slots: 2,
1033                ..EMPTY_COST
1034            },
1035        );
1036    }
1037
1038    fn dispatch_simple_alu_2op(&mut self, d: RawReg, s: RawReg) {
1039        self.dispatch_2op(
1040            d,
1041            s,
1042            InstCost {
1043                latency: 1,
1044                decode_slots: 1 + u32::from(d.get() != s.get()),
1045                alu_slots: 1,
1046                ..EMPTY_COST
1047            },
1048        );
1049    }
1050
1051    fn dispatch_simple_alu_2op_32bit(&mut self, d: RawReg, s: RawReg) {
1052        self.dispatch_2op(
1053            d,
1054            s,
1055            InstCost {
1056                latency: 1 + i8::from(B::BITNESS == Bitness::B64),
1057                decode_slots: 1 + u32::from(d.get() != s.get()) + u32::from(B::BITNESS == Bitness::B64),
1058                alu_slots: 1,
1059                ..EMPTY_COST
1060            },
1061        );
1062    }
1063
1064    fn dispatch_simple_alu_3op(&mut self, d: RawReg, s1: RawReg, s2: RawReg) {
1065        self.dispatch_3op(
1066            d,
1067            s1,
1068            s2,
1069            InstCost {
1070                latency: 1,
1071                decode_slots: 1 + u32::from((d.get() != s1.get()) & (d.get() != s2.get())),
1072                alu_slots: 1,
1073                ..EMPTY_COST
1074            },
1075        );
1076    }
1077
1078    fn dispatch_simple_alu_3op_32(&mut self, d: RawReg, s1: RawReg, s2: RawReg) {
1079        self.dispatch_3op(
1080            d,
1081            s1,
1082            s2,
1083            InstCost {
1084                latency: 1 + i8::from(B::BITNESS == Bitness::B64),
1085                decode_slots: 1 + u32::from((d.get() != s1.get()) & (d.get() != s2.get())) + u32::from(B::BITNESS == Bitness::B64),
1086                alu_slots: 1,
1087                ..EMPTY_COST
1088            },
1089        );
1090    }
1091
1092    fn dispatch_shift(&mut self, d: RawReg, s1: RawReg, s2: RawReg) {
1093        self.dispatch_3op(
1094            d,
1095            s1,
1096            s2,
1097            InstCost {
1098                latency: 1,
1099                decode_slots: 2 + u32::from(d.get() != s1.get()),
1100                alu_slots: 1,
1101                ..EMPTY_COST
1102            },
1103        )
1104    }
1105
1106    fn dispatch_shift_32(&mut self, d: RawReg, s1: RawReg, s2: RawReg) {
1107        self.dispatch_3op(
1108            d,
1109            s1,
1110            s2,
1111            InstCost {
1112                latency: 1 + i8::from(B::BITNESS == Bitness::B64),
1113                decode_slots: 2 + u32::from(d.get() != s1.get()) + u32::from(B::BITNESS == Bitness::B64),
1114                alu_slots: 1,
1115                ..EMPTY_COST
1116            },
1117        )
1118    }
1119
1120    fn dispatch_shift_imm_alt(&mut self, d: RawReg, s: RawReg) {
1121        self.dispatch_2op(
1122            d,
1123            s,
1124            InstCost {
1125                latency: 1,
1126                decode_slots: 3,
1127                alu_slots: 1,
1128                ..EMPTY_COST
1129            },
1130        )
1131    }
1132
1133    fn dispatch_shift_imm_alt_32(&mut self, d: RawReg, s: RawReg) {
1134        self.dispatch_2op(
1135            d,
1136            s,
1137            InstCost {
1138                latency: 2,
1139                decode_slots: 4,
1140                alu_slots: 1,
1141                ..EMPTY_COST
1142            },
1143        )
1144    }
1145
1146    fn dispatch_compare(&mut self, d: RawReg, s1: RawReg, s2: RawReg) {
1147        self.dispatch_3op(
1148            d,
1149            s1,
1150            s2,
1151            InstCost {
1152                latency: 3,
1153                decode_slots: 3,
1154                alu_slots: 1,
1155                ..EMPTY_COST
1156            },
1157        )
1158    }
1159
1160    fn dispatch_compare_imm(&mut self, d: RawReg, s: RawReg) {
1161        self.dispatch_2op(
1162            d,
1163            s,
1164            InstCost {
1165                latency: 3,
1166                decode_slots: 3,
1167                alu_slots: 1,
1168                ..EMPTY_COST
1169            },
1170        )
1171    }
1172
1173    fn dispatch_cmov(&mut self, d: RawReg, s: RawReg, c: RawReg) {
1174        self.dispatch_3op(
1175            d,
1176            s,
1177            c,
1178            InstCost {
1179                latency: 2,
1180                decode_slots: 2,
1181                alu_slots: 1,
1182                ..EMPTY_COST
1183            },
1184        )
1185    }
1186
1187    fn dispatch_cmov_imm(&mut self, d: RawReg, c: RawReg) {
1188        self.dispatch_2op(
1189            d,
1190            c,
1191            InstCost {
1192                latency: 2,
1193                decode_slots: 3,
1194                alu_slots: 1,
1195                ..EMPTY_COST
1196            },
1197        )
1198    }
1199
1200    fn dispatch_min_max(&mut self, d: RawReg, s1: RawReg, s2: RawReg) {
1201        self.dispatch_3op(
1202            d,
1203            s1,
1204            s2,
1205            InstCost {
1206                latency: 3,
1207                decode_slots: 2 + u32::from((d.get() != s1.get()) & (d.get() != s2.get())),
1208                alu_slots: 1,
1209                ..EMPTY_COST
1210            },
1211        )
1212    }
1213
1214    fn dispatch_division(&mut self, d: RawReg, s1: RawReg, s2: RawReg) {
1215        self.dispatch_3op(
1216            d,
1217            s1,
1218            s2,
1219            InstCost {
1220                latency: 60,
1221                decode_slots: 4,
1222                alu_slots: 1,
1223                div_slots: 1,
1224                ..EMPTY_COST
1225            },
1226        )
1227    }
1228}
1229
1230impl<'a, B, T> GasVisitorT for Simulator<'a, B, T>
1231where
1232    B: BitnessT,
1233    T: Tracer,
1234{
1235    #[inline]
1236    fn take_block_cost(&mut self) -> Option<u32> {
1237        if (self.instructions_in_flight() == 0) & self.finished {
1238            let cycles = self.cycles;
1239            self.clear();
1240
1241            let cycles = cast((cast(cycles).to_signed() - GAS_COST_SLACK).max(1)).to_unsigned();
1242            Some(cycles)
1243        } else {
1244            None
1245        }
1246    }
1247
1248    fn is_at_start_of_basic_block(&self) -> bool {
1249        self.instructions == 0
1250    }
1251}
1252
1253impl<'a, B, T> ParsingVisitor for Simulator<'a, B, T>
1254where
1255    B: BitnessT,
1256    T: Tracer,
1257{
1258    type ReturnTy = ();
1259
1260    // Simple ALU instructions (3 op)
1261
    /// Costs `xor` as a simple three-operand ALU instruction; the offset and argument length are unused.
    #[inline(always)]
    fn xor(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, s2: RawReg) -> Self::ReturnTy {
        self.dispatch_simple_alu_3op(d, s1, s2)
    }
1266
    /// Costs `and` as a simple three-operand ALU instruction; the offset and argument length are unused.
    #[inline(always)]
    fn and(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, s2: RawReg) -> Self::ReturnTy {
        self.dispatch_simple_alu_3op(d, s1, s2)
    }
1271
    /// Costs `or` as a simple three-operand ALU instruction; the offset and argument length are unused.
    #[inline(always)]
    fn or(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, s2: RawReg) -> Self::ReturnTy {
        self.dispatch_simple_alu_3op(d, s1, s2)
    }
1276
    /// Costs `add_64` as a simple three-operand ALU instruction; the offset and argument length are unused.
    #[inline(always)]
    fn add_64(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, s2: RawReg) -> Self::ReturnTy {
        self.dispatch_simple_alu_3op(d, s1, s2)
    }
1281
    /// Costs `sub_64` as a simple three-operand ALU instruction; the offset and argument length are unused.
    #[inline(always)]
    fn sub_64(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, s2: RawReg) -> Self::ReturnTy {
        self.dispatch_simple_alu_3op(d, s1, s2)
    }
1286
1287    // Simple ALU instructions (3 op), 32-bit
1288
    /// Costs `add_32` as a simple 32-bit three-operand ALU instruction; the offset and argument length are unused.
    #[inline(always)]
    fn add_32(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, s2: RawReg) -> Self::ReturnTy {
        self.dispatch_simple_alu_3op_32(d, s1, s2)
    }
1293
    /// Costs `sub_32` as a simple 32-bit three-operand ALU instruction; the offset and argument length are unused.
    #[inline(always)]
    fn sub_32(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, s2: RawReg) -> Self::ReturnTy {
        self.dispatch_simple_alu_3op_32(d, s1, s2)
    }
1298
1299    // Simple ALU instructions (2 op)
1300
    /// Costs `xor_imm` as a simple two-operand ALU instruction; the immediate value does not affect the cost.
    #[inline(always)]
    fn xor_imm(&mut self, _offset: u32, _args_length: u32, d: RawReg, s: RawReg, _imm: u32) -> Self::ReturnTy {
        self.dispatch_simple_alu_2op(d, s)
    }
1305
    /// Costs `and_imm` as a simple two-operand ALU instruction; the immediate value does not affect the cost.
    #[inline(always)]
    fn and_imm(&mut self, _offset: u32, _args_length: u32, d: RawReg, s: RawReg, _imm: u32) -> Self::ReturnTy {
        self.dispatch_simple_alu_2op(d, s)
    }
1310
    /// Costs `or_imm` as a simple two-operand ALU instruction; the immediate value does not affect the cost.
    #[inline(always)]
    fn or_imm(&mut self, _offset: u32, _args_length: u32, d: RawReg, s: RawReg, _imm: u32) -> Self::ReturnTy {
        self.dispatch_simple_alu_2op(d, s)
    }
1315
    /// Costs `add_imm_64` as a simple two-operand ALU instruction; the immediate value does not affect the cost.
    #[inline(always)]
    fn add_imm_64(&mut self, _offset: u32, _args_length: u32, d: RawReg, s: RawReg, _imm: u32) -> Self::ReturnTy {
        // TODO: in 'd != s' case we use a single `lea`, see if modeling that makes sense
        self.dispatch_simple_alu_2op(d, s)
    }
1321
    /// Costs a 64-bit logical right shift by an immediate as a simple two-operand ALU instruction on `d` and `s1`.
    #[inline(always)]
    fn shift_logical_right_imm_64(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, _s2: u32) -> Self::ReturnTy {
        self.dispatch_simple_alu_2op(d, s1)
    }
1326
    /// Costs a 64-bit arithmetic right shift by an immediate as a simple two-operand ALU instruction on `d` and `s1`.
    #[inline(always)]
    fn shift_arithmetic_right_imm_64(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, _s2: u32) -> Self::ReturnTy {
        self.dispatch_simple_alu_2op(d, s1)
    }
1331
    /// Costs a 64-bit logical left shift by an immediate as a simple two-operand ALU instruction on `d` and `s1`.
    #[inline(always)]
    fn shift_logical_left_imm_64(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, _s2: u32) -> Self::ReturnTy {
        self.dispatch_simple_alu_2op(d, s1)
    }
1336
    /// Costs a 64-bit right rotate by an immediate as a simple two-operand ALU instruction on `d` and `s1`.
    #[inline(always)]
    fn rotate_right_imm_64(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, _c: u32) -> Self::ReturnTy {
        self.dispatch_simple_alu_2op(d, s1)
    }
1341
    /// Costs `reverse_byte` as a simple two-operand ALU instruction; the offset and argument length are unused.
    #[inline(always)]
    fn reverse_byte(&mut self, _offset: u32, _args_length: u32, d: RawReg, s: RawReg) -> Self::ReturnTy {
        self.dispatch_simple_alu_2op(d, s)
    }
1346
1347    // Simple ALU instructions (2 op), 32-bit
1348
    /// Costs `add_imm_32` as a simple 32-bit two-operand ALU instruction; the immediate value does not affect the cost.
    #[inline(always)]
    fn add_imm_32(&mut self, _offset: u32, _args_length: u32, d: RawReg, s: RawReg, _imm: u32) -> Self::ReturnTy {
        // TODO: in 'd != s' case we use a single `lea`, see if modeling that makes sense
        self.dispatch_simple_alu_2op_32bit(d, s)
    }
1354
    /// Costs a 32-bit logical right shift by an immediate as a simple 32-bit two-operand ALU instruction on `d` and `s1`.
    #[inline(always)]
    fn shift_logical_right_imm_32(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, _s2: u32) -> Self::ReturnTy {
        self.dispatch_simple_alu_2op_32bit(d, s1)
    }
1359
    /// Costs a 32-bit arithmetic right shift by an immediate as a simple 32-bit two-operand ALU instruction on `d` and `s1`.
    #[inline(always)]
    fn shift_arithmetic_right_imm_32(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, _s2: u32) -> Self::ReturnTy {
        self.dispatch_simple_alu_2op_32bit(d, s1)
    }
1364
    /// Costs a 32-bit logical left shift by an immediate as a simple 32-bit two-operand ALU instruction on `d` and `s1`.
    #[inline(always)]
    fn shift_logical_left_imm_32(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, _s2: u32) -> Self::ReturnTy {
        self.dispatch_simple_alu_2op_32bit(d, s1)
    }
1369
    /// Costs a 32-bit right rotate by an immediate as a simple 32-bit two-operand ALU instruction on `d` and `s1`.
    #[inline(always)]
    fn rotate_right_imm_32(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, _c: u32) -> Self::ReturnTy {
        self.dispatch_simple_alu_2op_32bit(d, s1)
    }
1374
1375    // Trivial (2 op, 1 cycle)
1376
    /// Costs `count_leading_zero_bits_32` as a trivial one-cycle two-operand instruction.
    #[inline(always)]
    fn count_leading_zero_bits_32(&mut self, _offset: u32, _args_length: u32, d: RawReg, s: RawReg) -> Self::ReturnTy {
        self.dispatch_trivial_2op_1c(d, s)
    }
1381
    /// Costs `count_leading_zero_bits_64` as a trivial one-cycle two-operand instruction.
    #[inline(always)]
    fn count_leading_zero_bits_64(&mut self, _offset: u32, _args_length: u32, d: RawReg, s: RawReg) -> Self::ReturnTy {
        self.dispatch_trivial_2op_1c(d, s)
    }
1386
    /// Costs `count_set_bits_32` as a trivial one-cycle two-operand instruction.
    #[inline(always)]
    fn count_set_bits_32(&mut self, _offset: u32, _args_length: u32, d: RawReg, s: RawReg) -> Self::ReturnTy {
        self.dispatch_trivial_2op_1c(d, s)
    }
1391
    /// Costs `count_set_bits_64` as a trivial one-cycle two-operand instruction.
    #[inline(always)]
    fn count_set_bits_64(&mut self, _offset: u32, _args_length: u32, d: RawReg, s: RawReg) -> Self::ReturnTy {
        self.dispatch_trivial_2op_1c(d, s)
    }
1396
    /// Costs `sign_extend_8` as a trivial one-cycle two-operand instruction.
    #[inline(always)]
    fn sign_extend_8(&mut self, _offset: u32, _args_length: u32, d: RawReg, s: RawReg) -> Self::ReturnTy {
        self.dispatch_trivial_2op_1c(d, s)
    }
1401
    /// Costs `sign_extend_16` as a trivial one-cycle two-operand instruction.
    #[inline(always)]
    fn sign_extend_16(&mut self, _offset: u32, _args_length: u32, d: RawReg, s: RawReg) -> Self::ReturnTy {
        self.dispatch_trivial_2op_1c(d, s)
    }
1406
    /// Costs `zero_extend_16` as a trivial one-cycle two-operand instruction.
    #[inline(always)]
    fn zero_extend_16(&mut self, _offset: u32, _args_length: u32, d: RawReg, s: RawReg) -> Self::ReturnTy {
        self.dispatch_trivial_2op_1c(d, s)
    }
1411
1412    // Trivial (2 op, 2 cycles)
1413
    /// Costs `count_trailing_zero_bits_32` as a trivial two-cycle two-operand instruction.
    #[inline(always)]
    fn count_trailing_zero_bits_32(&mut self, _offset: u32, _args_length: u32, d: RawReg, s: RawReg) -> Self::ReturnTy {
        self.dispatch_trivial_2op_2c(d, s)
    }
1418
    /// Costs `count_trailing_zero_bits_64` as a trivial two-cycle two-operand instruction.
    #[inline(always)]
    fn count_trailing_zero_bits_64(&mut self, _offset: u32, _args_length: u32, d: RawReg, s: RawReg) -> Self::ReturnTy {
        self.dispatch_trivial_2op_2c(d, s)
    }
1423
1424    // Shifts and rotates, 64-bit
1425
    /// Costs a 64-bit logical right shift by a register amount via `dispatch_shift`.
    #[inline(always)]
    fn shift_logical_right_64(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, s2: RawReg) -> Self::ReturnTy {
        self.dispatch_shift(d, s1, s2)
    }
1430
    /// Costs a 64-bit arithmetic right shift by a register amount via `dispatch_shift`.
    #[inline(always)]
    fn shift_arithmetic_right_64(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, s2: RawReg) -> Self::ReturnTy {
        self.dispatch_shift(d, s1, s2)
    }
1435
    /// Costs a 64-bit logical left shift by a register amount via `dispatch_shift`.
    #[inline(always)]
    fn shift_logical_left_64(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, s2: RawReg) -> Self::ReturnTy {
        self.dispatch_shift(d, s1, s2)
    }
1440
    /// Costs a 64-bit left rotate by a register amount via `dispatch_shift`.
    #[inline(always)]
    fn rotate_left_64(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, s2: RawReg) -> Self::ReturnTy {
        self.dispatch_shift(d, s1, s2)
    }
1445
    /// Costs a 64-bit right rotate by a register amount via `dispatch_shift`.
    #[inline(always)]
    fn rotate_right_64(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, s2: RawReg) -> Self::ReturnTy {
        self.dispatch_shift(d, s1, s2)
    }
1450
1451    // Shifts and rotates, 32-bit
1452
    /// Costs a 32-bit logical right shift by a register amount via `dispatch_shift_32`.
    #[inline(always)]
    fn shift_logical_right_32(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, s2: RawReg) -> Self::ReturnTy {
        self.dispatch_shift_32(d, s1, s2)
    }
1457
    /// Costs a 32-bit arithmetic right shift by a register amount via `dispatch_shift_32`.
    #[inline(always)]
    fn shift_arithmetic_right_32(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, s2: RawReg) -> Self::ReturnTy {
        self.dispatch_shift_32(d, s1, s2)
    }
1462
    /// Costs a 32-bit logical left shift by a register amount via `dispatch_shift_32`.
    #[inline(always)]
    fn shift_logical_left_32(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, s2: RawReg) -> Self::ReturnTy {
        self.dispatch_shift_32(d, s1, s2)
    }
1467
    /// Costs a 32-bit left rotate by a register amount via `dispatch_shift_32`.
    #[inline(always)]
    fn rotate_left_32(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, s2: RawReg) -> Self::ReturnTy {
        self.dispatch_shift_32(d, s1, s2)
    }
1472
    /// Costs a 32-bit right rotate by a register amount via `dispatch_shift_32`.
    #[inline(always)]
    fn rotate_right_32(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, s2: RawReg) -> Self::ReturnTy {
        self.dispatch_shift_32(d, s1, s2)
    }
1477
1478    // Shifts and rotates, alt
1479
    /// Costs the "alt" form of a 64-bit logical right shift, where the register operand is `s2` and `_s1` is an immediate.
    #[inline(always)]
    fn shift_logical_right_imm_alt_64(&mut self, _offset: u32, _args_length: u32, d: RawReg, s2: RawReg, _s1: u32) -> Self::ReturnTy {
        self.dispatch_shift_imm_alt(d, s2)
    }
1484
    /// Costs the "alt" form of a 64-bit arithmetic right shift, where the register operand is `s2` and `_s1` is an immediate.
    #[inline(always)]
    fn shift_arithmetic_right_imm_alt_64(&mut self, _offset: u32, _args_length: u32, d: RawReg, s2: RawReg, _s1: u32) -> Self::ReturnTy {
        self.dispatch_shift_imm_alt(d, s2)
    }
1489
    /// Costs the "alt" form of a 64-bit logical left shift, where the register operand is `s2` and `_s1` is an immediate.
    #[inline(always)]
    fn shift_logical_left_imm_alt_64(&mut self, _offset: u32, _args_length: u32, d: RawReg, s2: RawReg, _s1: u32) -> Self::ReturnTy {
        self.dispatch_shift_imm_alt(d, s2)
    }
1494
    /// Costs the "alt" form of a 64-bit right rotate, where the register operand is `s` and `_c` is an immediate.
    #[inline(always)]
    fn rotate_right_imm_alt_64(&mut self, _offset: u32, _args_length: u32, d: RawReg, s: RawReg, _c: u32) -> Self::ReturnTy {
        self.dispatch_shift_imm_alt(d, s)
    }
1499
1500    // Shifts and rotates, alt (32-bit)
1501
    /// Costs the "alt" form of a 32-bit logical right shift, where the register operand is `s2` and `_s1` is an immediate.
    #[inline(always)]
    fn shift_logical_right_imm_alt_32(&mut self, _offset: u32, _args_length: u32, d: RawReg, s2: RawReg, _s1: u32) -> Self::ReturnTy {
        self.dispatch_shift_imm_alt_32(d, s2)
    }
1506
    /// Costs the "alt" form of a 32-bit arithmetic right shift, where the register operand is `s2` and `_s1` is an immediate.
    #[inline(always)]
    fn shift_arithmetic_right_imm_alt_32(&mut self, _offset: u32, _args_length: u32, d: RawReg, s2: RawReg, _s1: u32) -> Self::ReturnTy {
        self.dispatch_shift_imm_alt_32(d, s2)
    }
1511
    /// Costs the "alt" form of a 32-bit logical left shift, where the register operand is `s2` and `_s1` is an immediate.
    #[inline(always)]
    fn shift_logical_left_imm_alt_32(&mut self, _offset: u32, _args_length: u32, d: RawReg, s2: RawReg, _s1: u32) -> Self::ReturnTy {
        self.dispatch_shift_imm_alt_32(d, s2)
    }
1516
    /// Costs the "alt" form of a 32-bit right rotate, where the register operand is `s` and `_c` is an immediate.
    #[inline(always)]
    fn rotate_right_imm_alt_32(&mut self, _offset: u32, _args_length: u32, d: RawReg, s: RawReg, _c: u32) -> Self::ReturnTy {
        self.dispatch_shift_imm_alt_32(d, s)
    }
1521
1522    // Register comparisons
1523
    /// Costs `set_less_than_unsigned` as a register-register comparison.
    #[inline(always)]
    fn set_less_than_unsigned(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, s2: RawReg) -> Self::ReturnTy {
        self.dispatch_compare(d, s1, s2)
    }
1528
    /// Costs `set_less_than_signed` as a register-register comparison.
    #[inline(always)]
    fn set_less_than_signed(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, s2: RawReg) -> Self::ReturnTy {
        self.dispatch_compare(d, s1, s2)
    }
1533
1534    // Register comparisons (immediate)
1535
    /// Costs `set_less_than_unsigned_imm` as a register-immediate comparison; the immediate `_s2` does not affect the cost.
    #[inline(always)]
    fn set_less_than_unsigned_imm(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, _s2: u32) -> Self::ReturnTy {
        self.dispatch_compare_imm(d, s1)
    }
1540
    /// Costs `set_less_than_signed_imm` as a register-immediate comparison; the immediate `_s2` does not affect the cost.
    #[inline(always)]
    fn set_less_than_signed_imm(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, _s2: u32) -> Self::ReturnTy {
        self.dispatch_compare_imm(d, s1)
    }
1545
    /// Costs `set_greater_than_unsigned_imm` as a register-immediate comparison; the immediate `_s2` does not affect the cost.
    #[inline(always)]
    fn set_greater_than_unsigned_imm(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, _s2: u32) -> Self::ReturnTy {
        self.dispatch_compare_imm(d, s1)
    }
1550
    /// Costs `set_greater_than_signed_imm` as a register-immediate comparison; the immediate `_s2` does not affect the cost.
    #[inline(always)]
    fn set_greater_than_signed_imm(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, _s2: u32) -> Self::ReturnTy {
        self.dispatch_compare_imm(d, s1)
    }
1555
1556    // Conditional moves
1557
    /// Costs `cmov_if_zero` as a conditional move that reads both `s` and `c`.
    #[inline(always)]
    fn cmov_if_zero(&mut self, _offset: u32, _args_length: u32, d: RawReg, s: RawReg, c: RawReg) -> Self::ReturnTy {
        self.dispatch_cmov(d, s, c)
    }
1562
    /// Costs `cmov_if_not_zero` as a conditional move that reads both `s` and `c`.
    #[inline(always)]
    fn cmov_if_not_zero(&mut self, _offset: u32, _args_length: u32, d: RawReg, s: RawReg, c: RawReg) -> Self::ReturnTy {
        self.dispatch_cmov(d, s, c)
    }
1567
    /// Costs `cmov_if_zero_imm` as a conditional move of an immediate that reads only `c`; `_s` does not affect the cost.
    #[inline(always)]
    fn cmov_if_zero_imm(&mut self, _offset: u32, _args_length: u32, d: RawReg, c: RawReg, _s: u32) -> Self::ReturnTy {
        self.dispatch_cmov_imm(d, c)
    }
1572
    /// Costs `cmov_if_not_zero_imm` as a conditional move of an immediate that reads only `c`; `_s` does not affect the cost.
    #[inline(always)]
    fn cmov_if_not_zero_imm(&mut self, _offset: u32, _args_length: u32, d: RawReg, c: RawReg, _s: u32) -> Self::ReturnTy {
        self.dispatch_cmov_imm(d, c)
    }
1577
1578    // Minimum/maximum
1579
    /// Costs `maximum` via the shared minimum/maximum dispatcher.
    #[inline(always)]
    fn maximum(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, s2: RawReg) -> Self::ReturnTy {
        self.dispatch_min_max(d, s1, s2)
    }
1584
    /// Costs `maximum_unsigned` via the shared minimum/maximum dispatcher.
    #[inline(always)]
    fn maximum_unsigned(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, s2: RawReg) -> Self::ReturnTy {
        self.dispatch_min_max(d, s1, s2)
    }
1589
    /// Costs `minimum` via the shared minimum/maximum dispatcher.
    #[inline(always)]
    fn minimum(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, s2: RawReg) -> Self::ReturnTy {
        self.dispatch_min_max(d, s1, s2)
    }
1594
    /// Costs `minimum_unsigned` via the shared minimum/maximum dispatcher.
    #[inline(always)]
    fn minimum_unsigned(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, s2: RawReg) -> Self::ReturnTy {
        self.dispatch_min_max(d, s1, s2)
    }
1599
1600    // Indirect loads
1601
    /// Costs `load_indirect_u8` as an indirect load through `base`, passing a size of 1.
    #[inline(always)]
    fn load_indirect_u8(&mut self, _offset: u32, _args_length: u32, dst: RawReg, base: RawReg, offset: u32) -> Self::ReturnTy {
        self.dispatch_indirect_load(dst, base, offset, 1)
    }
1606
    /// Costs `load_indirect_i8` as an indirect load through `base`, passing a size of 1.
    #[inline(always)]
    fn load_indirect_i8(&mut self, _offset: u32, _args_length: u32, dst: RawReg, base: RawReg, offset: u32) -> Self::ReturnTy {
        self.dispatch_indirect_load(dst, base, offset, 1)
    }
1611
    /// Costs `load_indirect_u16` as an indirect load through `base`, passing a size of 2.
    #[inline(always)]
    fn load_indirect_u16(&mut self, _offset: u32, _args_length: u32, dst: RawReg, base: RawReg, offset: u32) -> Self::ReturnTy {
        self.dispatch_indirect_load(dst, base, offset, 2)
    }
1616
    /// Costs `load_indirect_i16` as an indirect load through `base`, passing a size of 2.
    #[inline(always)]
    fn load_indirect_i16(&mut self, _offset: u32, _args_length: u32, dst: RawReg, base: RawReg, offset: u32) -> Self::ReturnTy {
        self.dispatch_indirect_load(dst, base, offset, 2)
    }
1621
    /// Costs `load_indirect_u32` as an indirect load through `base`, passing a size of 4.
    #[inline(always)]
    fn load_indirect_u32(&mut self, _offset: u32, _args_length: u32, dst: RawReg, base: RawReg, offset: u32) -> Self::ReturnTy {
        self.dispatch_indirect_load(dst, base, offset, 4)
    }
1626
    /// Costs `load_indirect_i32` as an indirect load through `base`, passing a size of 4.
    #[inline(always)]
    fn load_indirect_i32(&mut self, _offset: u32, _args_length: u32, dst: RawReg, base: RawReg, offset: u32) -> Self::ReturnTy {
        self.dispatch_indirect_load(dst, base, offset, 4)
    }
1631
    /// Costs `load_indirect_u64` as an indirect load through `base`, passing a size of 8.
    #[inline(always)]
    fn load_indirect_u64(&mut self, _offset: u32, _args_length: u32, dst: RawReg, base: RawReg, offset: u32) -> Self::ReturnTy {
        self.dispatch_indirect_load(dst, base, offset, 8)
    }
1636
1637    // Direct loads
1638
    /// Costs `load_u8` as a direct load into `dst`, passing a size of 1.
    #[inline(always)]
    fn load_u8(&mut self, _offset: u32, _args_length: u32, dst: RawReg, offset: u32) -> Self::ReturnTy {
        self.dispatch_load(dst, offset, 1)
    }
1643
    /// Costs `load_i8` as a direct load into `dst`, passing a size of 1.
    #[inline(always)]
    fn load_i8(&mut self, _offset: u32, _args_length: u32, dst: RawReg, offset: u32) -> Self::ReturnTy {
        self.dispatch_load(dst, offset, 1)
    }
1648
    /// Handles `load_u16`: delegates to `dispatch_load` with a final argument of 2
    /// (presumably the access width in bytes; confirm against the helper).
    #[inline(always)]
    fn load_u16(&mut self, _offset: u32, _args_length: u32, dst: RawReg, offset: u32) -> Self::ReturnTy {
        self.dispatch_load(dst, offset, 2)
    }
1653
    /// Handles `load_i16`: delegates to `dispatch_load` with a final argument of 2
    /// (presumably the access width in bytes; confirm). No signedness information is passed on.
    #[inline(always)]
    fn load_i16(&mut self, _offset: u32, _args_length: u32, dst: RawReg, offset: u32) -> Self::ReturnTy {
        self.dispatch_load(dst, offset, 2)
    }
1658
    /// Handles `load_u32`: delegates to `dispatch_load` with a final argument of 4
    /// (presumably the access width in bytes; confirm against the helper).
    #[inline(always)]
    fn load_u32(&mut self, _offset: u32, _args_length: u32, dst: RawReg, offset: u32) -> Self::ReturnTy {
        self.dispatch_load(dst, offset, 4)
    }
1663
    /// Handles `load_i32`: delegates to `dispatch_load` with a final argument of 4
    /// (presumably the access width in bytes; confirm). No signedness information is passed on.
    #[inline(always)]
    fn load_i32(&mut self, _offset: u32, _args_length: u32, dst: RawReg, offset: u32) -> Self::ReturnTy {
        self.dispatch_load(dst, offset, 4)
    }
1668
    /// Handles `load_u64`: delegates to `dispatch_load` with a final argument of 8
    /// (presumably the access width in bytes; confirm against the helper).
    #[inline(always)]
    fn load_u64(&mut self, _offset: u32, _args_length: u32, dst: RawReg, offset: u32) -> Self::ReturnTy {
        self.dispatch_load(dst, offset, 8)
    }
1673
1674    // Indirect stores (imm)
1675
    /// Handles `store_imm_indirect_u8`: delegates to `dispatch_store_imm_indirect` with `base`, `offset`
    /// and 1 (presumably the access width in bytes; confirm). The stored immediate `_value` is unused.
    #[inline(always)]
    fn store_imm_indirect_u8(&mut self, _offset: u32, _args_length: u32, base: RawReg, offset: u32, _value: u32) -> Self::ReturnTy {
        self.dispatch_store_imm_indirect(base, offset, 1)
    }
1680
    /// Handles `store_imm_indirect_u16`: delegates to `dispatch_store_imm_indirect` with `base`, `offset`
    /// and 2 (presumably the access width in bytes; confirm). The stored immediate `_value` is unused.
    #[inline(always)]
    fn store_imm_indirect_u16(&mut self, _offset: u32, _args_length: u32, base: RawReg, offset: u32, _value: u32) -> Self::ReturnTy {
        self.dispatch_store_imm_indirect(base, offset, 2)
    }
1685
    /// Handles `store_imm_indirect_u32`: delegates to `dispatch_store_imm_indirect` with `base`, `offset`
    /// and 4 (presumably the access width in bytes; confirm). The stored immediate `_value` is unused.
    #[inline(always)]
    fn store_imm_indirect_u32(&mut self, _offset: u32, _args_length: u32, base: RawReg, offset: u32, _value: u32) -> Self::ReturnTy {
        self.dispatch_store_imm_indirect(base, offset, 4)
    }
1690
    /// Handles `store_imm_indirect_u64`: delegates to `dispatch_store_imm_indirect` with `base`, `offset`
    /// and 8 (presumably the access width in bytes; confirm). The stored immediate `_value` is unused.
    #[inline(always)]
    fn store_imm_indirect_u64(&mut self, _offset: u32, _args_length: u32, base: RawReg, offset: u32, _value: u32) -> Self::ReturnTy {
        self.dispatch_store_imm_indirect(base, offset, 8)
    }
1695
1696    // Indirect stores
1697
    /// Handles `store_indirect_u8`: delegates to `dispatch_store_indirect` with `src`, `base`, `offset`
    /// and 1 (presumably the access width in bytes; confirm against the helper).
    #[inline(always)]
    fn store_indirect_u8(&mut self, _offset: u32, _args_length: u32, src: RawReg, base: RawReg, offset: u32) -> Self::ReturnTy {
        self.dispatch_store_indirect(src, base, offset, 1)
    }
1702
    /// Handles `store_indirect_u16`: delegates to `dispatch_store_indirect` with `src`, `base`, `offset`
    /// and 2 (presumably the access width in bytes; confirm against the helper).
    #[inline(always)]
    fn store_indirect_u16(&mut self, _offset: u32, _args_length: u32, src: RawReg, base: RawReg, offset: u32) -> Self::ReturnTy {
        self.dispatch_store_indirect(src, base, offset, 2)
    }
1707
    /// Handles `store_indirect_u32`: delegates to `dispatch_store_indirect` with `src`, `base`, `offset`
    /// and 4 (presumably the access width in bytes; confirm against the helper).
    #[inline(always)]
    fn store_indirect_u32(&mut self, _offset: u32, _args_length: u32, src: RawReg, base: RawReg, offset: u32) -> Self::ReturnTy {
        self.dispatch_store_indirect(src, base, offset, 4)
    }
1712
    /// Handles `store_indirect_u64`: delegates to `dispatch_store_indirect` with `src`, `base`, `offset`
    /// and 8 (presumably the access width in bytes; confirm against the helper).
    #[inline(always)]
    fn store_indirect_u64(&mut self, _offset: u32, _args_length: u32, src: RawReg, base: RawReg, offset: u32) -> Self::ReturnTy {
        self.dispatch_store_indirect(src, base, offset, 8)
    }
1717
1718    // Stores (imm)
1719
    /// Handles `store_imm_u8`: delegates to `dispatch_store_imm` with `offset` and 1
    /// (presumably the access width in bytes; confirm). The stored immediate `_value` is unused.
    #[inline(always)]
    fn store_imm_u8(&mut self, _offset: u32, _args_length: u32, offset: u32, _value: u32) -> Self::ReturnTy {
        self.dispatch_store_imm(offset, 1)
    }
1724
    /// Handles `store_imm_u16`: delegates to `dispatch_store_imm` with `offset` and 2
    /// (presumably the access width in bytes; confirm). The stored immediate `_value` is unused.
    #[inline(always)]
    fn store_imm_u16(&mut self, _offset: u32, _args_length: u32, offset: u32, _value: u32) -> Self::ReturnTy {
        self.dispatch_store_imm(offset, 2)
    }
1729
    /// Handles `store_imm_u32`: delegates to `dispatch_store_imm` with `offset` and 4
    /// (presumably the access width in bytes; confirm). The stored immediate `_value` is unused.
    #[inline(always)]
    fn store_imm_u32(&mut self, _offset: u32, _args_length: u32, offset: u32, _value: u32) -> Self::ReturnTy {
        self.dispatch_store_imm(offset, 4)
    }
1734
    /// Handles `store_imm_u64`: delegates to `dispatch_store_imm` with `offset` and 8
    /// (presumably the access width in bytes; confirm). The stored immediate `_value` is unused.
    #[inline(always)]
    fn store_imm_u64(&mut self, _offset: u32, _args_length: u32, offset: u32, _value: u32) -> Self::ReturnTy {
        self.dispatch_store_imm(offset, 8)
    }
1739
1740    // Stores
1741
    /// Handles `store_u8`: delegates to `dispatch_store` with `src`, `offset` and 1
    /// (presumably the access width in bytes; confirm against the helper).
    #[inline(always)]
    fn store_u8(&mut self, _offset: u32, _args_length: u32, src: RawReg, offset: u32) -> Self::ReturnTy {
        self.dispatch_store(src, offset, 1)
    }
1746
    /// Handles `store_u16`: delegates to `dispatch_store` with `src`, `offset` and 2
    /// (presumably the access width in bytes; confirm against the helper).
    #[inline(always)]
    fn store_u16(&mut self, _offset: u32, _args_length: u32, src: RawReg, offset: u32) -> Self::ReturnTy {
        self.dispatch_store(src, offset, 2)
    }
1751
    /// Handles `store_u32`: delegates to `dispatch_store` with `src`, `offset` and 4
    /// (presumably the access width in bytes; confirm against the helper).
    #[inline(always)]
    fn store_u32(&mut self, _offset: u32, _args_length: u32, src: RawReg, offset: u32) -> Self::ReturnTy {
        self.dispatch_store(src, offset, 4)
    }
1756
    /// Handles `store_u64`: delegates to `dispatch_store` with `src`, `offset` and 8
    /// (presumably the access width in bytes; confirm against the helper).
    #[inline(always)]
    fn store_u64(&mut self, _offset: u32, _args_length: u32, src: RawReg, offset: u32) -> Self::ReturnTy {
        self.dispatch_store(src, offset, 8)
    }
1761
1762    // Branches
1763
    /// Handles `branch_less_unsigned`: forwards all arguments unchanged to `dispatch_branch`.
    /// The kind of comparison is not passed on.
    #[inline(always)]
    fn branch_less_unsigned(&mut self, offset: u32, args_length: u32, s1: RawReg, s2: RawReg, imm: u32) -> Self::ReturnTy {
        self.dispatch_branch(offset, args_length, s1, s2, imm)
    }
1768
    /// Handles `branch_less_signed`: forwards all arguments unchanged to `dispatch_branch`.
    /// The kind of comparison is not passed on.
    #[inline(always)]
    fn branch_less_signed(&mut self, offset: u32, args_length: u32, s1: RawReg, s2: RawReg, imm: u32) -> Self::ReturnTy {
        self.dispatch_branch(offset, args_length, s1, s2, imm)
    }
1773
    /// Handles `branch_greater_or_equal_unsigned`: forwards all arguments unchanged to `dispatch_branch`.
    /// The kind of comparison is not passed on.
    #[inline(always)]
    fn branch_greater_or_equal_unsigned(&mut self, offset: u32, args_length: u32, s1: RawReg, s2: RawReg, imm: u32) -> Self::ReturnTy {
        self.dispatch_branch(offset, args_length, s1, s2, imm)
    }
1778
    /// Handles `branch_greater_or_equal_signed`: forwards all arguments unchanged to `dispatch_branch`.
    /// The kind of comparison is not passed on.
    #[inline(always)]
    fn branch_greater_or_equal_signed(&mut self, offset: u32, args_length: u32, s1: RawReg, s2: RawReg, imm: u32) -> Self::ReturnTy {
        self.dispatch_branch(offset, args_length, s1, s2, imm)
    }
1783
    /// Handles `branch_eq`: forwards all arguments unchanged to `dispatch_branch`.
    /// The kind of comparison is not passed on.
    #[inline(always)]
    fn branch_eq(&mut self, offset: u32, args_length: u32, s1: RawReg, s2: RawReg, imm: u32) -> Self::ReturnTy {
        self.dispatch_branch(offset, args_length, s1, s2, imm)
    }
1788
    /// Handles `branch_not_eq`: forwards all arguments unchanged to `dispatch_branch`.
    /// The kind of comparison is not passed on.
    #[inline(always)]
    fn branch_not_eq(&mut self, offset: u32, args_length: u32, s1: RawReg, s2: RawReg, imm: u32) -> Self::ReturnTy {
        self.dispatch_branch(offset, args_length, s1, s2, imm)
    }
1793
1794    // Branches (with immediate)
1795
    /// Handles `branch_eq_imm`: forwards `offset`, `args_length`, `s1` and `imm` to
    /// `dispatch_branch_imm`. The `_s2` immediate operand is unused.
    #[inline(always)]
    fn branch_eq_imm(&mut self, offset: u32, args_length: u32, s1: RawReg, _s2: u32, imm: u32) -> Self::ReturnTy {
        self.dispatch_branch_imm(offset, args_length, s1, imm);
    }
1800
    /// Handles `branch_not_eq_imm`: forwards `offset`, `args_length`, `s1` and `imm` to
    /// `dispatch_branch_imm`. The `_s2` immediate operand is unused.
    #[inline(always)]
    fn branch_not_eq_imm(&mut self, offset: u32, args_length: u32, s1: RawReg, _s2: u32, imm: u32) -> Self::ReturnTy {
        self.dispatch_branch_imm(offset, args_length, s1, imm);
    }
1805
    /// Handles `branch_less_unsigned_imm`: forwards `offset`, `args_length`, `s1` and `imm` to
    /// `dispatch_branch_imm`. The `_s2` immediate operand is unused.
    #[inline(always)]
    fn branch_less_unsigned_imm(&mut self, offset: u32, args_length: u32, s1: RawReg, _s2: u32, imm: u32) -> Self::ReturnTy {
        self.dispatch_branch_imm(offset, args_length, s1, imm);
    }
1810
    /// Handles `branch_less_signed_imm`: forwards `offset`, `args_length`, `s1` and `imm` to
    /// `dispatch_branch_imm`. The `_s2` immediate operand is unused.
    #[inline(always)]
    fn branch_less_signed_imm(&mut self, offset: u32, args_length: u32, s1: RawReg, _s2: u32, imm: u32) -> Self::ReturnTy {
        self.dispatch_branch_imm(offset, args_length, s1, imm);
    }
1815
    /// Handles `branch_greater_or_equal_unsigned_imm`: forwards `offset`, `args_length`, `s1` and
    /// `imm` to `dispatch_branch_imm`. The `_s2` immediate operand is unused.
    #[inline(always)]
    fn branch_greater_or_equal_unsigned_imm(&mut self, offset: u32, args_length: u32, s1: RawReg, _s2: u32, imm: u32) -> Self::ReturnTy {
        self.dispatch_branch_imm(offset, args_length, s1, imm);
    }
1820
    /// Handles `branch_greater_or_equal_signed_imm`: forwards `offset`, `args_length`, `s1` and
    /// `imm` to `dispatch_branch_imm`. The `_s2` immediate operand is unused.
    #[inline(always)]
    fn branch_greater_or_equal_signed_imm(&mut self, offset: u32, args_length: u32, s1: RawReg, _s2: u32, imm: u32) -> Self::ReturnTy {
        self.dispatch_branch_imm(offset, args_length, s1, imm);
    }
1825
    /// Handles `branch_less_or_equal_unsigned_imm`: forwards `offset`, `args_length`, `s1` and
    /// `imm` to `dispatch_branch_imm`. The `_s2` immediate operand is unused.
    #[inline(always)]
    fn branch_less_or_equal_unsigned_imm(&mut self, offset: u32, args_length: u32, s1: RawReg, _s2: u32, imm: u32) -> Self::ReturnTy {
        self.dispatch_branch_imm(offset, args_length, s1, imm);
    }
1830
    /// Handles `branch_less_or_equal_signed_imm`: forwards `offset`, `args_length`, `s1` and
    /// `imm` to `dispatch_branch_imm`. The `_s2` immediate operand is unused.
    #[inline(always)]
    fn branch_less_or_equal_signed_imm(&mut self, offset: u32, args_length: u32, s1: RawReg, _s2: u32, imm: u32) -> Self::ReturnTy {
        self.dispatch_branch_imm(offset, args_length, s1, imm);
    }
1835
    /// Handles `branch_greater_unsigned_imm`: forwards `offset`, `args_length`, `s1` and `imm` to
    /// `dispatch_branch_imm`. The `_s2` immediate operand is unused.
    #[inline(always)]
    fn branch_greater_unsigned_imm(&mut self, offset: u32, args_length: u32, s1: RawReg, _s2: u32, imm: u32) -> Self::ReturnTy {
        self.dispatch_branch_imm(offset, args_length, s1, imm);
    }
1840
    /// Handles `branch_greater_signed_imm`: forwards `offset`, `args_length`, `s1` and `imm` to
    /// `dispatch_branch_imm`. The `_s2` immediate operand is unused.
    #[inline(always)]
    fn branch_greater_signed_imm(&mut self, offset: u32, args_length: u32, s1: RawReg, _s2: u32, imm: u32) -> Self::ReturnTy {
        self.dispatch_branch_imm(offset, args_length, s1, imm);
    }
1845
1846    // Division
1847
    /// Handles `div_unsigned_32`: delegates to `dispatch_division` with the destination and both
    /// source registers. Operand width and signedness are not passed on.
    #[inline(always)]
    fn div_unsigned_32(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, s2: RawReg) -> Self::ReturnTy {
        self.dispatch_division(d, s1, s2)
    }
1852
    /// Handles `div_signed_32`: delegates to `dispatch_division` with the destination and both
    /// source registers. Operand width and signedness are not passed on.
    #[inline(always)]
    fn div_signed_32(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, s2: RawReg) -> Self::ReturnTy {
        self.dispatch_division(d, s1, s2)
    }
1857
    /// Handles `rem_unsigned_32`: delegates to `dispatch_division` with the destination and both
    /// source registers. Operand width and signedness are not passed on.
    #[inline(always)]
    fn rem_unsigned_32(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, s2: RawReg) -> Self::ReturnTy {
        self.dispatch_division(d, s1, s2)
    }
1862
    /// Handles `rem_signed_32`: delegates to `dispatch_division` with the destination and both
    /// source registers. Operand width and signedness are not passed on.
    #[inline(always)]
    fn rem_signed_32(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, s2: RawReg) -> Self::ReturnTy {
        self.dispatch_division(d, s1, s2)
    }
1867
    /// Handles `div_unsigned_64`: delegates to `dispatch_division` with the destination and both
    /// source registers. Operand width and signedness are not passed on.
    #[inline(always)]
    fn div_unsigned_64(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, s2: RawReg) -> Self::ReturnTy {
        self.dispatch_division(d, s1, s2)
    }
1872
    /// Handles `div_signed_64`: delegates to `dispatch_division` with the destination and both
    /// source registers. Operand width and signedness are not passed on.
    #[inline(always)]
    fn div_signed_64(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, s2: RawReg) -> Self::ReturnTy {
        self.dispatch_division(d, s1, s2)
    }
1877
    /// Handles `rem_unsigned_64`: delegates to `dispatch_division` with the destination and both
    /// source registers. Operand width and signedness are not passed on.
    #[inline(always)]
    fn rem_unsigned_64(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, s2: RawReg) -> Self::ReturnTy {
        self.dispatch_division(d, s1, s2)
    }
1882
    /// Handles `rem_signed_64`: delegates to `dispatch_division` with the destination and both
    /// source registers. Operand width and signedness are not passed on.
    #[inline(always)]
    fn rem_signed_64(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, s2: RawReg) -> Self::ReturnTy {
        self.dispatch_division(d, s1, s2)
    }
1887
1888    // Misc
1889
1890    #[inline(always)]
1891    fn and_inverted(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, s2: RawReg) -> Self::ReturnTy {
1892        // TODO: inaccurate
1893        self.dispatch_3op(
1894            d,
1895            s1,
1896            s2,
1897            InstCost {
1898                latency: 2,
1899                decode_slots: 3,
1900                alu_slots: 1,
1901                ..EMPTY_COST
1902            },
1903        )
1904    }
1905
1906    #[inline(always)]
1907    fn or_inverted(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, s2: RawReg) -> Self::ReturnTy {
1908        // TODO: inaccurate
1909        self.dispatch_3op(
1910            d,
1911            s1,
1912            s2,
1913            InstCost {
1914                latency: 2,
1915                decode_slots: 3,
1916                alu_slots: 1,
1917                ..EMPTY_COST
1918            },
1919        )
1920    }
1921
1922    #[inline(always)]
1923    fn xnor(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, s2: RawReg) -> Self::ReturnTy {
1924        self.dispatch_3op(
1925            d,
1926            s1,
1927            s2,
1928            InstCost {
1929                latency: 2,
1930                decode_slots: 2 + u32::from((d.get() != s1.get()) & (d.get() != s2.get())),
1931                alu_slots: 1,
1932                ..EMPTY_COST
1933            },
1934        );
1935    }
1936
1937    #[inline(always)]
1938    fn negate_and_add_imm_64(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, _s2: u32) -> Self::ReturnTy {
1939        self.dispatch_2op(
1940            d,
1941            s1,
1942            InstCost {
1943                latency: 2,
1944                decode_slots: 3,
1945                alu_slots: 1,
1946                ..EMPTY_COST
1947            },
1948        )
1949    }
1950
1951    #[inline(always)]
1952    fn negate_and_add_imm_32(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, _s2: u32) -> Self::ReturnTy {
1953        self.dispatch_2op(
1954            d,
1955            s1,
1956            InstCost {
1957                latency: 3,
1958                decode_slots: 4,
1959                alu_slots: 1,
1960                ..EMPTY_COST
1961            },
1962        )
1963    }
1964
    /// Handles a register-to-register move by delegating to `dispatch_move_reg_avx2` with the
    /// destination and source registers; no explicit cost is supplied here.
    #[inline(always)]
    fn move_reg(&mut self, _offset: u32, _args_length: u32, dst: RawReg, src: RawReg) -> Self::ReturnTy {
        self.dispatch_move_reg_avx2(dst, src);
    }
1969
1970    #[inline(always)]
1971    fn load_imm(&mut self, _offset: u32, _args_length: u32, dst: RawReg, _value: u32) -> Self::ReturnTy {
1972        self.dispatch_1op_dst(
1973            dst,
1974            InstCost {
1975                latency: 1,
1976                decode_slots: 1,
1977                ..EMPTY_COST
1978            },
1979        )
1980    }
1981
1982    #[inline(always)]
1983    fn load_imm64(&mut self, _offset: u32, _args_length: u32, dst: RawReg, _value: u64) -> Self::ReturnTy {
1984        self.dispatch_1op_dst(
1985            dst,
1986            InstCost {
1987                latency: 1,
1988                decode_slots: 2,
1989                ..EMPTY_COST
1990            },
1991        );
1992    }
1993
1994    #[inline(always)]
1995    fn mul_32(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, s2: RawReg) -> Self::ReturnTy {
1996        self.dispatch_3op(
1997            d,
1998            s1,
1999            s2,
2000            InstCost {
2001                latency: 4,
2002                decode_slots: 2 + u32::from((d.get() != s1.get()) & (d.get() != s2.get())),
2003                alu_slots: 1,
2004                mul_slots: 1,
2005                ..EMPTY_COST
2006            },
2007        )
2008    }
2009
2010    #[inline(always)]
2011    fn mul_64(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, s2: RawReg) -> Self::ReturnTy {
2012        self.dispatch_3op(
2013            d,
2014            s1,
2015            s2,
2016            InstCost {
2017                latency: 3,
2018                decode_slots: 1 + u32::from((d.get() != s1.get()) & (d.get() != s2.get())),
2019                alu_slots: 1,
2020                mul_slots: 1,
2021                ..EMPTY_COST
2022            },
2023        )
2024    }
2025
2026    #[inline(always)]
2027    fn mul_imm_32(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, _s2: u32) -> Self::ReturnTy {
2028        self.dispatch_2op(
2029            d,
2030            s1,
2031            InstCost {
2032                latency: 4,
2033                decode_slots: 2 + u32::from(d.get() != s1.get()),
2034                alu_slots: 1,
2035                mul_slots: 1,
2036                ..EMPTY_COST
2037            },
2038        )
2039    }
2040
2041    #[inline(always)]
2042    fn mul_imm_64(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, _s2: u32) -> Self::ReturnTy {
2043        self.dispatch_2op(
2044            d,
2045            s1,
2046            InstCost {
2047                latency: 3,
2048                decode_slots: 1 + u32::from(d.get() != s1.get()),
2049                alu_slots: 1,
2050                mul_slots: 1,
2051                ..EMPTY_COST
2052            },
2053        )
2054    }
2055
2056    #[inline(always)]
2057    fn mul_upper_signed_signed(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, s2: RawReg) -> Self::ReturnTy {
2058        self.dispatch_3op(
2059            d,
2060            s1,
2061            s2,
2062            InstCost {
2063                latency: 4,
2064                decode_slots: 4,
2065                alu_slots: 1,
2066                mul_slots: 1,
2067                ..EMPTY_COST
2068            },
2069        )
2070    }
2071
2072    #[inline(always)]
2073    fn mul_upper_unsigned_unsigned(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, s2: RawReg) -> Self::ReturnTy {
2074        self.dispatch_3op(
2075            d,
2076            s1,
2077            s2,
2078            InstCost {
2079                latency: 4,
2080                decode_slots: 4,
2081                alu_slots: 1,
2082                mul_slots: 1,
2083                ..EMPTY_COST
2084            },
2085        )
2086    }
2087
2088    #[inline(always)]
2089    fn mul_upper_signed_unsigned(&mut self, _offset: u32, _args_length: u32, d: RawReg, s1: RawReg, s2: RawReg) -> Self::ReturnTy {
2090        self.dispatch_3op(
2091            d,
2092            s1,
2093            s2,
2094            InstCost {
2095                latency: 6,
2096                decode_slots: 4,
2097                alu_slots: 1,
2098                mul_slots: 1,
2099                ..EMPTY_COST
2100            },
2101        )
2102    }
2103
2104    // End of block instructions
2105
    /// Handles an invalid instruction by calling `dispatch_finish(2)`. Marked `#[cold]`, unlike
    /// the other handlers here. The meaning of the argument (presumably a latency) should be
    /// confirmed against `dispatch_finish`.
    #[cold]
    fn invalid(&mut self, _offset: u32, _args_length: u32) -> Self::ReturnTy {
        self.dispatch_finish(2);
    }
2110
    /// Handles `trap` by calling `dispatch_finish(2)`. The meaning of the argument
    /// (presumably a latency) should be confirmed against `dispatch_finish`.
    #[inline(always)]
    fn trap(&mut self, _offset: u32, _args_length: u32) -> Self::ReturnTy {
        self.dispatch_finish(2);
    }
2115
    /// Handles `fallthrough` by calling `dispatch_finish(2)`. The meaning of the argument
    /// (presumably a latency) should be confirmed against `dispatch_finish`.
    #[inline(always)]
    fn fallthrough(&mut self, _offset: u32, _args_length: u32) -> Self::ReturnTy {
        self.dispatch_finish(2);
    }
2120
2121    #[inline(always)]
2122    fn unlikely(&mut self, _offset: u32, _args_length: u32) -> Self::ReturnTy {
2123        self.dispatch_generic(
2124            None,
2125            None,
2126            None,
2127            InstCost {
2128                latency: 40,
2129                decode_slots: 1,
2130                ..EMPTY_COST
2131            },
2132        );
2133    }
2134
    /// Handles `jump` by calling `dispatch_finish(15)`; the jump target is unused. The meaning of
    /// the argument (presumably a latency) should be confirmed against `dispatch_finish`.
    #[inline(always)]
    fn jump(&mut self, _offset: u32, _args_length: u32, _target: u32) -> Self::ReturnTy {
        self.dispatch_finish(15);
    }
2139
    /// Handles `load_imm_and_jump` by calling `dispatch_finish(15)`; the written register, the
    /// immediate and the jump target are all unused. The meaning of the argument (presumably a
    /// latency) should be confirmed against `dispatch_finish`.
    #[inline(always)]
    fn load_imm_and_jump(&mut self, _offset: u32, _args_length: u32, _ra: RawReg, _value: u32, _target: u32) -> Self::ReturnTy {
        self.dispatch_finish(15);
    }
2144
2145    #[inline(always)]
2146    fn jump_indirect(&mut self, _offset: u32, _args_length: u32, base: RawReg, _base_offset: u32) -> Self::ReturnTy {
2147        self.dispatch_generic(
2148            None,
2149            Some(base),
2150            None,
2151            InstCost {
2152                latency: 22,
2153                decode_slots: 1,
2154                ..EMPTY_COST
2155            },
2156        );
2157        self.wait_until_empty();
2158        self.finished = true;
2159    }
2160
2161    #[inline(always)]
2162    fn load_imm_and_jump_indirect(
2163        &mut self,
2164        _offset: u32,
2165        _args_length: u32,
2166        _ra: RawReg,
2167        base: RawReg,
2168        _value: u32,
2169        _base_offset: u32,
2170    ) -> Self::ReturnTy {
2171        self.dispatch_generic(
2172            None,
2173            Some(base),
2174            None,
2175            InstCost {
2176                latency: 22,
2177                decode_slots: 1,
2178                ..EMPTY_COST
2179            },
2180        );
2181        self.wait_until_empty();
2182        self.finished = true;
2183    }
2184
2185    // Special instructions
2186
2187    #[inline(always)]
2188    fn ecalli(&mut self, _offset: u32, _args_length: u32, _imm: u32) -> Self::ReturnTy {
2189        self.dispatch_generic(
2190            None,
2191            None,
2192            None,
2193            InstCost {
2194                latency: 100,
2195                decode_slots: 4,
2196                alu_slots: 1,
2197                ..EMPTY_COST
2198            },
2199        );
2200    }
2201
2202    #[inline(always)]
2203    fn sbrk(&mut self, _offset: u32, _args_length: u32, dst: RawReg, src: RawReg) -> Self::ReturnTy {
2204        // TODO: YOLO assigned
2205        self.dispatch_2op(
2206            dst,
2207            src,
2208            InstCost {
2209                latency: 100,
2210                decode_slots: 4,
2211                alu_slots: 1,
2212                ..EMPTY_COST
2213            },
2214        );
2215    }
2216
2217    #[inline(always)]
2218    fn memset(&mut self, _offset: u32, _args_length: u32) -> Self::ReturnTy {
2219        // TODO: YOLO assigned
2220        self.dispatch_generic(
2221            None,
2222            None,
2223            None,
2224            InstCost {
2225                latency: 100,
2226                decode_slots: 4,
2227                alu_slots: 1,
2228                ..EMPTY_COST
2229            },
2230        )
2231    }
2232}
2233
/// Options for [`timeline_for_instructions`].
#[derive(Clone)]
#[non_exhaustive]
pub struct TimelineConfig<'a> {
    /// Returned by the tracer's `should_enable_fast_forward()`. When set, the rendered timeline
    /// is also post-processed so that `.` cells inside the cycle columns repeat the preceding
    /// event character, unless that character is `R` or `D`.
    pub should_enable_fast_forward: bool,
    /// Format used to print each instruction next to its timeline row.
    pub instruction_format: InstructionFormat<'a>,
}
2240
2241impl<'a> Default for TimelineConfig<'a> {
2242    fn default() -> Self {
2243        TimelineConfig {
2244            should_enable_fast_forward: false,
2245            instruction_format: InstructionFormat {
2246                is_64_bit: true,
2247                ..InstructionFormat::default()
2248            },
2249        }
2250    }
2251}
2252
2253pub fn timeline_for_instructions(
2254    code: &[u8],
2255    isa: InstructionSetKind,
2256    cache_model: CacheModel,
2257    instructions: &[crate::program::ParsedInstruction],
2258    config: TimelineConfig,
2259) -> (String, u32) {
2260    use alloc::collections::BTreeMap;
2261
2262    struct TimelineTracer<'a> {
2263        should_enable_fast_forward: bool,
2264        timeline: &'a mut BTreeMap<(u32, u32), EventKind>,
2265    }
2266
2267    impl<'a> Tracer for TimelineTracer<'a> {
2268        const SHOULD_CALL_ON_EVENT: bool = true;
2269
2270        fn should_enable_fast_forward(&self) -> bool {
2271            self.should_enable_fast_forward
2272        }
2273
2274        fn on_event(&mut self, cycle: u32, instruction: u32, event: EventKind) {
2275            match self.timeline.entry((cycle, instruction)) {
2276                alloc::collections::btree_map::Entry::Vacant(entry) => {
2277                    #[cfg(all(test, feature = "logging"))]
2278                    log::debug!(
2279                        "on_event[{cycle}]: instruction={instruction} '{}' (event={event:?})",
2280                        char::from(event)
2281                    );
2282                    entry.insert(event);
2283                }
2284                alloc::collections::btree_map::Entry::Occupied(entry) => {
2285                    panic!(
2286                        "duplicate timeline update: cycle={cycle} instruction={instruction} old_event={:?} new_event={event:?}",
2287                        entry.get()
2288                    );
2289                }
2290            }
2291        }
2292    }
2293
2294    let count = instructions
2295        .iter()
2296        .take_while(|inst| !inst.kind.opcode().starts_new_basic_block())
2297        .count();
2298
2299    let mut instructions = instructions[..(count + 1).min(instructions.len())].to_vec();
2300    if !instructions
2301        .last()
2302        .map(|instruction| instruction.kind.opcode().starts_new_basic_block())
2303        .unwrap_or(false)
2304    {
2305        let next_pc = instructions.last().map(|instruction| instruction.next_offset.0).unwrap_or(0);
2306        instructions.push(crate::program::ParsedInstruction {
2307            kind: crate::program::Instruction::invalid,
2308            offset: crate::program::ProgramCounter(next_pc),
2309            next_offset: crate::program::ProgramCounter(next_pc + 1),
2310        });
2311    }
2312
2313    let mut timeline_map = BTreeMap::new();
2314    let mut sim = Simulator::<B64, _>::new(
2315        code,
2316        isa,
2317        cache_model,
2318        TimelineTracer {
2319            should_enable_fast_forward: config.should_enable_fast_forward,
2320            timeline: &mut timeline_map,
2321        },
2322    );
2323
2324    for &instruction in &instructions {
2325        assert!(sim.take_block_cost().is_none());
2326        instruction.visit_parsing(&mut sim);
2327    }
2328
2329    let total_cycles = cast(sim.cycles).to_usize();
2330    let block_cost = sim.take_block_cost().unwrap();
2331    #[cfg(all(test, feature = "logging"))]
2332    log::debug!("Total cycles: {total_cycles}");
2333
2334    #[cfg(all(test, feature = "logging"))]
2335    log::debug!("Block cost: {block_cost}");
2336
2337    let mut timeline = vec!['.'; total_cycles * instructions.len()];
2338    for ((cycle, instruction), event) in timeline_map {
2339        let index = instruction as usize * total_cycles + cycle as usize;
2340        timeline[index] = char::from(event);
2341    }
2342
2343    let mut timeline_s = String::new();
2344    for (nth_instruction, instruction) in instructions.iter().enumerate() {
2345        use core::fmt::Write;
2346
2347        let line = &timeline[nth_instruction * total_cycles..(nth_instruction + 1) * total_cycles];
2348        timeline_s.extend(line.iter().copied());
2349        timeline_s.push_str("  ");
2350        writeln!(&mut timeline_s, "{}", instruction.display(&config.instruction_format)).unwrap();
2351    }
2352
2353    if config.should_enable_fast_forward {
2354        let mut timeline_new = String::with_capacity(timeline_s.len());
2355        let mut is_in_cycles = true;
2356        let mut last = '.';
2357        for mut ch in timeline_s.chars() {
2358            if ch == ' ' {
2359                is_in_cycles = false;
2360            } else if ch == '\n' {
2361                is_in_cycles = true;
2362                last = '.';
2363            } else if ch == '.' {
2364                if last != 'R' && last != 'D' && is_in_cycles {
2365                    ch = last;
2366                }
2367            } else {
2368                last = ch;
2369            }
2370            timeline_new.push(ch);
2371        }
2372        timeline_s = timeline_new;
2373    }
2374
2375    (timeline_s, block_cost)
2376}
2377
2378pub fn trap_cost(isa: InstructionSetKind, cache_model: CacheModel) -> u32 {
2379    let mut sim = Simulator::<B64, _>::new(&[], isa, cache_model, ());
2380    crate::program::ParsedInstruction {
2381        kind: crate::program::Instruction::trap,
2382        offset: crate::program::ProgramCounter(0),
2383        next_offset: crate::program::ProgramCounter(0),
2384    }
2385    .visit_parsing(&mut sim);
2386    sim.take_block_cost().unwrap()
2387}
2388
2389#[cfg(test)]
2390mod tests {
2391    use alloc::string::String;
2392    use alloc::vec::Vec;
2393
2394    use super::{timeline_for_instructions, CacheModel, TimelineConfig};
2395    use crate::assembler::assemble;
2396    use crate::program::{InstructionSetKind, ProgramBlob};
2397
    /// Cache model shared by the timeline tests: every memory access is an L1 hit.
    #[cfg(test)]
    fn test_config() -> CacheModel {
        CacheModel::L1Hit
    }
2402
2403    #[cfg(test)]
2404    fn assert_timeline(config: CacheModel, program: &str, expected_timeline: &str) {
2405        use crate::cast::cast;
2406
2407        let _ = env_logger::try_init();
2408
2409        let program = assemble(Some(InstructionSetKind::Latest64), program).unwrap();
2410        let blob = ProgramBlob::parse(program.into()).unwrap();
2411        let instructions: Vec<_> = blob.instructions().collect();
2412
2413        let (timeline_s, cycles) = timeline_for_instructions(
2414            blob.code(),
2415            InstructionSetKind::Latest64,
2416            config,
2417            &instructions,
2418            TimelineConfig::default(),
2419        );
2420        let mut expected_timeline_s = String::new();
2421        let mut expected_cycles = 0;
2422        for line in expected_timeline.lines() {
2423            let line = line.trim();
2424            if line.is_empty() {
2425                continue;
2426            }
2427            expected_timeline_s.push_str(line);
2428            expected_timeline_s.push('\n');
2429
2430            expected_cycles = expected_cycles.max(line.split("  ").next().unwrap().len() as u32);
2431        }
2432
2433        if timeline_s != expected_timeline_s {
2434            panic!("Timeline mismatch!\n\nExpected timeline:\n{expected_timeline_s}\nActual timeline:\n{timeline_s}");
2435        }
2436
2437        let expected_cycles = cast(expected_cycles).to_signed() - 3;
2438        assert_eq!(cast(cycles).to_signed(), expected_cycles);
2439
2440        #[cfg(feature = "logging")]
2441        log::debug!("Rerunning with fast-forward enabled...");
2442
2443        let timeline_config = TimelineConfig {
2444            should_enable_fast_forward: true,
2445            ..TimelineConfig::default()
2446        };
2447        let (timeline_ff_s, cycles_ff) =
2448            timeline_for_instructions(blob.code(), InstructionSetKind::Latest64, config, &instructions, timeline_config);
2449        assert_eq!(cycles_ff, cycles);
2450        if timeline_ff_s != expected_timeline_s {
2451            panic!("Timeline mismatch for fast-forward!\n\nExpected timeline:\n{expected_timeline_s}\nActual timeline:\n{timeline_ff_s}");
2452        }
2453    }
2454
    /// Two additions where the second does not read the result of the first: the expected
    /// timeline has identical rows for both.
    #[test]
    fn test_parallel_simple() {
        assert_timeline(
            test_config(),
            "
                a0 = a1 + a2
                a1 = a1 + a2
                trap
            ",
            "
                DeER.  a0 = a1 + a2
                DeER.  a1 = a1 + a2
                DeeER  trap
            ",
        );
    }
2471
    /// The second addition reads `a0`, which the first one writes: its expected row has one `=`
    /// cell (presumably a stall waiting for the operand) before it executes.
    #[test]
    fn test_sequential_simple() {
        assert_timeline(
            test_config(),
            "
                a0 = a1 + a2
                a1 = a0 + a2
                trap
            ",
            "
                DeER..  a0 = a1 + a2
                D=eER.  a1 = a0 + a2
                .DeeER  trap
            ",
        );
    }
2488
    /// Four independent 64-bit immediate loads: the expected rows start in pairs, two in the
    /// first cycle and two in the next. `load_imm64` costs two decode slots and
    /// `MAX_DECODE_PER_CYCLE` is 4.
    #[test]
    fn test_sequential_decode_limits() {
        assert_timeline(
            test_config(),
            "
                a0 = 0x12345678aabbccdd
                a1 = 0x12345678aabbccdd
                a2 = 0x12345678aabbccdd
                a3 = 0x12345678aabbccdd
                trap
            ",
            "
                DeER...  a0 = 0x12345678aabbccdd
                DeER...  a1 = 0x12345678aabbccdd
                .DeER..  a2 = 0x12345678aabbccdd
                .DeER..  a3 = 0x12345678aabbccdd
                ..DeeER  trap
            ",
        );
    }
2509
    /// Two multiplications with no data dependency between them: the second one's expected row
    /// still shows three `=` cells before it executes (presumably waiting for the multiplier;
    /// confirm against the simulator).
    #[test]
    fn test_resource_limits_mul() {
        assert_timeline(
            test_config(),
            "
                a0 = a1 * a2
                a1 = a3 * a4
                trap
            ",
            "
                DeeeER...  a0 = a1 * a2
                D===eeeER  a1 = a3 * a4
                .DeeE---R  trap
            ",
        );
    }
2526
    /// A multiplication that reads `a0`, which the preceding addition writes: its expected row
    /// has one `=` cell before three `e` cells.
    #[test]
    fn test_mul_with_dep() {
        assert_timeline(
            test_config(),
            "
                a0 = a1 + a2
                a4 = a0 * a3
                trap
            ",
            "
                DeER...  a0 = a1 + a2
                D=eeeER  a4 = a0 * a3
                .DeeE-R  trap
            ",
        );
    }
2543
    /// An immediate load, a register move of its result, and an addition that reads the moved
    /// register: the move's expected row contains only `D`, while the addition's row still has
    /// one `=` cell before it executes.
    #[test]
    fn test_register_move() {
        assert_timeline(
            test_config(),
            "
                s0 = 1
                a0 = s0
                a1 = a0 + 1
                trap
            ",
            "
                DeER..  s0 = 0x1
                D.....  a0 = s0
                D=eER.  a1 = a0 + 0x1
                .DeeER  trap
            ",
        )
    }
2562
2563    #[test]
2564    fn test_memory_accesses() {
        // Snapshot test: a register move, three loads relative to `sp`, an `sp`
        // adjustment and a final `ret`.
        //
        // Expected: the move row contains only `D`; each load shows an `eeee` span;
        // `ret` waits (`===`) and begins executing in the column where the load of `ra`
        // shows `E`, then has a long `e` span and retires in the last column.
        //
        // NOTE(review): the four-wide load span matches `CacheModel::L1Hit`
        // (`memory_access_cost: 4`); presumably `test_config()` returns that model —
        // confirm.
        // NOTE(review): presumably `ret` is modelled as depending on `ra` — confirm.
2565        assert_timeline(
2566            test_config(),
2567            "
2568                a0 = s1
2569                ra = u64 [sp + 0x30]
2570                s0 = u64 [sp + 0x28]
2571                s1 = u64 [sp + 0x20]
2572                sp = sp + 0x38
2573                ret
2574            ",
2575            "
2576                D............................  a0 = s1
2577                DeeeeER......................  ra = u64 [sp + 0x30]
2578                DeeeeER......................  s0 = u64 [sp + 0x28]
2579                DeeeeER......................  s1 = u64 [sp + 0x20]
2580                .DeE--R......................  sp = sp + 0x38
2581                .D===eeeeeeeeeeeeeeeeeeeeeeER  ret
2582            ",
2583        )
2584    }
2585
2586    #[test]
2587    fn test_empty() {
        // Snapshot test for the smallest program in this group: a block consisting of
        // nothing but `fallthrough`. The expected timeline is the single row `DeeER`.
        //
        // NOTE(review): the glyphs look like llvm-mca's timeline view (D dispatched,
        // e executing, E executed, R retired) — confirm in `assert_timeline`.
2588        assert_timeline(
2589            test_config(),
2590            "
2591                fallthrough
2592            ",
2593            "
2594                DeeER  fallthrough
2595            ",
2596        );
2597    }
2598
2599    #[test]
2600    fn test_overwrite_register() {
        // Snapshot test: `s0` is written three times in a row (load, addition, load
        // through `s0`) and finally used as the target of an indirect jump.
        //
        // Expected: the third instruction (`s0 = u64 [s0]`) waits only one `=`, i.e. it
        // starts in the column right after the addition's `e`, not after the first
        // load's longer `eeee` span. The first write to `s0` therefore does not hold
        // back a reader of the second write. The addition itself finishes early and
        // shows `---` until it retires in the same column as the first load.
        // `jump [s0]` waits `====` and starts executing in the column where the second
        // load shows `E`.
        //
        // NOTE(review): `=` / `-` presumably mean "waiting to execute" / "waiting to
        // retire", as in llvm-mca's timeline view — confirm in `assert_timeline`.
2601        assert_timeline(
2602            test_config(),
2603            "
2604                s0 = u64 [sp]
2605                s0 = a1 + a2
2606                s0 = u64 [s0]
2607                jump [s0]
2608            ",
2609            "
2610                DeeeeER.......................  s0 = u64 [sp]
2611                DeE---R.......................  s0 = a1 + a2
2612                D=eeeeER......................  s0 = u64 [s0]
2613                .D====eeeeeeeeeeeeeeeeeeeeeeER  jump [s0]
2614            ",
2615        );
2616    }
2617
2618    #[test]
2619    fn test_load_and_jump() {
        // Snapshot test: a byte load into `a2` followed by a conditional branch that
        // compares `a2` against zero and targets the label `@0` at the top of the block.
        //
        // Expected: the branch has `D` in column 0 but waits `====`, starting its single
        // `e` in the column where the load shows `E`.
        // The expected text differs from the input in two visible ways: the offset `11`
        // is rendered as `0xb`, and the label `@0` is rendered as `0`.
        //
        // NOTE(review): the glyphs look like llvm-mca's timeline view — confirm in
        // `assert_timeline`.
2620        assert_timeline(
2621            test_config(),
2622            "
2623                @0:
2624                a2 = u8 [a0 + 11]
2625                jump @0 if a2 == 0
2626            ",
2627            "
2628                DeeeeER.  a2 = u8 [a0 + 0xb]
2629                D====eER  jump 0 if a2 == 0
2630            ",
2631        );
2632    }
2633
2634    #[test]
2635    fn test_complex() {
        // Snapshot test: a nine-instruction block mixing a sign-extending 16-bit load,
        // a chain of bitwise/shift/add operations on `a1` and `a2`, a byte store and a
        // terminating `trap`.
        //
        // Visible features of the expected timeline:
        //  * `a2 = a2 & ...` waits `===` and starts in the column where the load that
        //    writes `a2` shows `E`; the two instructions after it each start one
        //    column later than the previous one.
        //  * The store row has by far the longest `e` span, and `trap` shows a long run
        //    of `-` so that both rows end with `R` in the final column.
        //
        // NOTE(review): `=` / `-` presumably mean "waiting to execute" / "waiting to
        // retire", as in llvm-mca's timeline view — confirm in `assert_timeline`.
2636        assert_timeline(
2637            test_config(),
2638            "
2639                a2 = i16 [a0 + 0x6]
2640                a1 = a1 & 0x7
2641                a3 = 0x1
2642                a1 = a1 << 0x8
2643                a2 = a2 & 0xfffffffffffff8ff
2644                a1 = a1 | a2
2645                a2 = a1 + a3
2646                u8 [a0 + 0x2] = a3
2647                trap
2648            ",
2649            "
2650                DeeeeER.......................  a2 = i16 [a0 + 0x6]
2651                DeE---R.......................  a1 = a1 & 0x7
2652                DeE---R.......................  a3 = 0x1
2653                D=eE--R.......................  a1 = a1 << 0x8
2654                .D===eER......................  a2 = a2 & 0xfffffffffffff8ff
2655                .D====eER.....................  a1 = a1 | a2
2656                .D=====eER....................  a2 = a1 + a3
2657                ..DeeeeeeeeeeeeeeeeeeeeeeeeeER  u8 [a0 + 0x2] = a3
2658                ..DeeE-----------------------R  trap
2659            ",
2660        );
2661    }
2662
2663    #[test]
2664    fn test_even_more_complex() {
        // Snapshot test: a sixteen-instruction, load-free block made of `clz`, shifts,
        // and-not, additions/subtractions (several with the `i32` prefix) and immediate
        // loads, ending in an unconditional `jump @0` back to the block's own label.
        //
        // Visible features of the expected timeline:
        //  * The `D` column advances from 0 to 7 over the block, with between one and
        //    four rows sharing each column.
        //  * Rows whose inputs are written by earlier rows show runs of `=` before they
        //    execute; rows that finish early show runs of `-` before `R`.
        //  * The label `@0` in the input is rendered as `0` in the expected `jump 0`.
        //
        // NOTE(review): glyph meanings presumably follow llvm-mca's timeline view —
        // confirm in `assert_timeline`.
2665        assert_timeline(
2666            test_config(),
2667            "
2668                @0:
2669                i32 a1 = clz a0
2670                i32 a0 = a0 << a1
2671                a1 = a1 << 0x17
2672                i32 a2 = a0 >> 0x8
2673                a3 = a0 >> 0x7
2674                a3 = a3 & ~a2
2675                i32 a2 = a2 - a1
2676                a0 = a0 << 0x18
2677                a3 = a3 & 0x1
2678                i32 a0 = a0 - a3
2679                i32 a0 = a0 >> 0x1f
2680                a1 = a2 + 0x4e800000
2681                i32 a0 = a0 + a1
2682                a1 = 0x46008c00
2683                ra = 0x24
2684                jump @0
2685            ",
2686            "
2687                DeER.....................  i32 a1 = clz a0
2688                D=eeER...................  i32 a0 = a0 << a1
2689                .DeE-R...................  a1 = a1 << 0x17
2690                .D==eeER.................  i32 a2 = a0 >> 0x8
2691                ..D=eE-R.................  a3 = a0 >> 0x7
2692                ...D==eeER...............  a3 = a3 & ~a2
2693                ....D=eeER...............  i32 a2 = a2 - a1
2694                ....DeE--R...............  a0 = a0 << 0x18
2695                ....D===eER..............  a3 = a3 & 0x1
2696                .....D===eeER............  i32 a0 = a0 - a3
2697                .....D=====eeER..........  i32 a0 = a0 >> 0x1f
2698                ......D=eE----R..........  a1 = a2 + 0x4e800000
2699                ......D======eeER........  i32 a0 = a0 + a1
2700                .......DeE------R........  a1 = 0x46008c00
2701                .......DeE------R........  ra = 0x24
2702                .......DeeeeeeeeeeeeeeeER  jump 0
2703            ",
2704        );
2705    }
2706
2707    #[test]
2708    fn test_super_complex_l1() {
        // Snapshot test: a 38-instruction block that starts with `unlikely`, assembles
        // a 64-bit value from eight byte loads, stores it, and ends in a conditional
        // branch back to `@0`. It passes `CacheModel::L1Hit` (`memory_access_cost: 4`)
        // explicitly instead of calling `test_config()`.
        //
        // Visible features of the expected timeline:
        //  * Loads show an `eeee` span, matching the L1 access cost of 4.
        //  * `unlikely` has a very long `e` span, and the first 32 rows (through
        //    `a0 = a0 | a2`) all have `R` in the same column as `unlikely`.
        //  * The 33rd row (`u64 [sp + 0x58] = a0`) has its `D` only in the column right
        //    after that shared `R` column.
        //
        // NOTE(review): the late decode of row 33 is presumably the effect of
        // `REORDER_BUFFER_SIZE` (32) being full until the first rows retire — confirm.
        // NOTE(review): glyph meanings presumably follow llvm-mca's timeline view —
        // confirm in `assert_timeline`.
2709        assert_timeline(
2710            CacheModel::L1Hit,
2711            "
2712                @0:
2713                unlikely
2714                t1 = u8 [s0]
2715                a1 = u8 [s0 + 0x11]
2716                a2 = 0x172d0
2717                a3 = u8 [s0 + 0x16]
2718                t0 = sp + 0x58
2719                a1 = a1 << 0x3
2720                a1 = a1 + a2
2721                a2 = u8 [a1]
2722                a5 = u8 [a1 + 0x1]
2723                s1 = u8 [a1 + 0x2]
2724                a4 = u8 [a1 + 0x3]
2725                a3 = a3 + t0
2726                a5 = a5 << 0x8
2727                s1 = s1 << 0x10
2728                a4 = a4 << 0x18
2729                a2 = a2 | a5
2730                a5 = u8 [a1 + 0x4]
2731                a0 = u8 [a1 + 0x5]
2732                a4 = a4 | s1
2733                s1 = u8 [a1 + 0x6]
2734                a1 = u8 [a1 + 0x7]
2735                a0 = a0 << 0x8
2736                a0 = a0 | a5
2737                s1 = s1 << 0x10
2738                a1 = a1 << 0x18
2739                a1 = a1 | s1
2740                a2 = a2 | a4
2741                a0 = a0 | a1
2742                a1 = s0 - t1
2743                a0 = a0 << 0x20
2744                a0 = a0 | a2
2745                u64 [sp + 0x58] = a0
2746                a0 = u8 [a3]
2747                a1 = u8 [a1 + 0x4]
2748                a0 = a1 * a0
2749                a1 = u8 [s0 + 0x23]
2750                jump @0 if a1 != 0
2751            ",
2752            "
2753                DeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeER............................  unlikely
2754                DeeeeE------------------------------------R............................  t1 = u8 [s0]
2755                DeeeeE------------------------------------R............................  a1 = u8 [s0 + 0x11]
2756                DeE---------------------------------------R............................  a2 = 0x172d0
2757                .DeeeeE-----------------------------------R............................  a3 = u8 [s0 + 0x16]
2758                .DeE--------------------------------------R............................  t0 = sp + 0x58
2759                .D===eE-----------------------------------R............................  a1 = a1 << 0x3
2760                ..D===eE----------------------------------R............................  a1 = a1 + a2
2761                ..D====eeeeE------------------------------R............................  a2 = u8 [a1]
2762                ..D====eeeeE------------------------------R............................  a5 = u8 [a1 + 0x1]
2763                ..D====eeeeE------------------------------R............................  s1 = u8 [a1 + 0x2]
2764                ...D===eeeeE------------------------------R............................  a4 = u8 [a1 + 0x3]
2765                ...D==eE----------------------------------R............................  a3 = a3 + t0
2766                ...D=======eE-----------------------------R............................  a5 = a5 << 0x8
2767                ...D=======eE-----------------------------R............................  s1 = s1 << 0x10
2768                ....D======eE-----------------------------R............................  a4 = a4 << 0x18
2769                ....D=======eE----------------------------R............................  a2 = a2 | a5
2770                ....D======eeeeE--------------------------R............................  a5 = u8 [a1 + 0x4]
2771                ....D=======eeeeE-------------------------R............................  a0 = u8 [a1 + 0x5]
2772                .....D======eE----------------------------R............................  a4 = a4 | s1
2773                .....D=======eeeeE------------------------R............................  s1 = u8 [a1 + 0x6]
2774                .....D=======eeeeE------------------------R............................  a1 = u8 [a1 + 0x7]
2775                .....D==========eE------------------------R............................  a0 = a0 << 0x8
2776                ......D==========eE-----------------------R............................  a0 = a0 | a5
2777                ......D==========eE-----------------------R............................  s1 = s1 << 0x10
2778                ......D==========eE-----------------------R............................  a1 = a1 << 0x18
2779                ......D===========eE----------------------R............................  a1 = a1 | s1
2780                .......D=======eE-------------------------R............................  a2 = a2 | a4
2781                .......D===========eE---------------------R............................  a0 = a0 | a1
2782                .......D========eE------------------------R............................  a1 = s0 - t1
2783                ........D===========eE--------------------R............................  a0 = a0 << 0x20
2784                ........D============eE-------------------R............................  a0 = a0 | a2
2785                ...........................................DeeeeeeeeeeeeeeeeeeeeeeeeeER  u64 [sp + 0x58] = a0
2786                ...........................................DeeeeE---------------------R  a0 = u8 [a3]
2787                ...........................................DeeeeE---------------------R  a1 = u8 [a1 + 0x4]
2788                ...........................................D====eeeE------------------R  a0 = a1 * a0
2789                ............................................DeeeeE--------------------R  a1 = u8 [s0 + 0x23]
2790                ............................................D====eE-------------------R  jump 0 if a1 != 0
2791            ",
2792        );
2793    }
2794
2795    #[test]
2796    fn test_super_complex_l2() {
        // Snapshot test: the same 38-instruction program as `test_super_complex_l1`,
        // but run with `CacheModel::L2Hit` (`memory_access_cost: 25`).
        //
        // Visible features of the expected timeline:
        //  * Every load has a much longer `e` span than an ALU row, and rows that
        //    consume loaded values show correspondingly long runs of `=`.
        //  * The 33rd row (`u64 [sp + 0x58] = a0`) still has its `D` in the column
        //    right after the `R` of `unlikely`, but unlike the stores/loads around it
        //    it now waits on a long run of `=` before executing.
        //
        // NOTE(review): the late decode of row 33 is presumably the effect of
        // `REORDER_BUFFER_SIZE` (32) — confirm.
        // NOTE(review): glyph meanings presumably follow llvm-mca's timeline view —
        // confirm in `assert_timeline`.
2797        assert_timeline(
2798            CacheModel::L2Hit,
2799            "
2800                @0:
2801                unlikely
2802                t1 = u8 [s0]
2803                a1 = u8 [s0 + 0x11]
2804                a2 = 0x172d0
2805                a3 = u8 [s0 + 0x16]
2806                t0 = sp + 0x58
2807                a1 = a1 << 0x3
2808                a1 = a1 + a2
2809                a2 = u8 [a1]
2810                a5 = u8 [a1 + 0x1]
2811                s1 = u8 [a1 + 0x2]
2812                a4 = u8 [a1 + 0x3]
2813                a3 = a3 + t0
2814                a5 = a5 << 0x8
2815                s1 = s1 << 0x10
2816                a4 = a4 << 0x18
2817                a2 = a2 | a5
2818                a5 = u8 [a1 + 0x4]
2819                a0 = u8 [a1 + 0x5]
2820                a4 = a4 | s1
2821                s1 = u8 [a1 + 0x6]
2822                a1 = u8 [a1 + 0x7]
2823                a0 = a0 << 0x8
2824                a0 = a0 | a5
2825                s1 = s1 << 0x10
2826                a1 = a1 << 0x18
2827                a1 = a1 | s1
2828                a2 = a2 | a4
2829                a0 = a0 | a1
2830                a1 = s0 - t1
2831                a0 = a0 << 0x20
2832                a0 = a0 | a2
2833                u64 [sp + 0x58] = a0
2834                a0 = u8 [a3]
2835                a1 = u8 [a1 + 0x4]
2836                a0 = a1 * a0
2837                a1 = u8 [s0 + 0x23]
2838                jump @0 if a1 != 0
2839            ",
2840            "
2841                DeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeER.....................................................................  unlikely
2842                DeeeeeeeeeeeeeeeeeeeeeeeeeE---------------R.....................................................................  t1 = u8 [s0]
2843                DeeeeeeeeeeeeeeeeeeeeeeeeeE---------------R.....................................................................  a1 = u8 [s0 + 0x11]
2844                DeE---------------------------------------R.....................................................................  a2 = 0x172d0
2845                .DeeeeeeeeeeeeeeeeeeeeeeeeeE--------------R.....................................................................  a3 = u8 [s0 + 0x16]
2846                .DeE--------------------------------------R.....................................................................  t0 = sp + 0x58
2847                .D========================eE--------------R.....................................................................  a1 = a1 << 0x3
2848                ..D========================eE-------------R.....................................................................  a1 = a1 + a2
2849                ..D=========================eeeeeeeeeeeeeeeeeeeeeeeeeER.........................................................  a2 = u8 [a1]
2850                ..D=========================eeeeeeeeeeeeeeeeeeeeeeeeeER.........................................................  a5 = u8 [a1 + 0x1]
2851                ..D=========================eeeeeeeeeeeeeeeeeeeeeeeeeER.........................................................  s1 = u8 [a1 + 0x2]
2852                ...D========================eeeeeeeeeeeeeeeeeeeeeeeeeER.........................................................  a4 = u8 [a1 + 0x3]
2853                ...D=======================eE-------------------------R.........................................................  a3 = a3 + t0
2854                ...D=================================================eER........................................................  a5 = a5 << 0x8
2855                ...D=================================================eER........................................................  s1 = s1 << 0x10
2856                ....D================================================eER........................................................  a4 = a4 << 0x18
2857                ....D=================================================eER.......................................................  a2 = a2 | a5
2858                ....D================================================eeeeeeeeeeeeeeeeeeeeeeeeeER................................  a5 = u8 [a1 + 0x4]
2859                ....D=================================================eeeeeeeeeeeeeeeeeeeeeeeeeER...............................  a0 = u8 [a1 + 0x5]
2860                .....D================================================eE------------------------R...............................  a4 = a4 | s1
2861                .....D=================================================eeeeeeeeeeeeeeeeeeeeeeeeeER..............................  s1 = u8 [a1 + 0x6]
2862                .....D=================================================eeeeeeeeeeeeeeeeeeeeeeeeeER..............................  a1 = u8 [a1 + 0x7]
2863                .....D=========================================================================eER..............................  a0 = a0 << 0x8
2864                ......D=========================================================================eER.............................  a0 = a0 | a5
2865                ......D=========================================================================eER.............................  s1 = s1 << 0x10
2866                ......D=========================================================================eER.............................  a1 = a1 << 0x18
2867                ......D==========================================================================eER............................  a1 = a1 | s1
2868                .......D======================================================================eE---R............................  a2 = a2 | a4
2869                .......D==========================================================================eER...........................  a0 = a0 | a1
2870                .......D==================eE--------------------------------------------------------R...........................  a1 = s0 - t1
2871                ........D==========================================================================eER..........................  a0 = a0 << 0x20
2872                ........D===========================================================================eER.........................  a0 = a0 | a2
2873                ...........................................D=========================================eeeeeeeeeeeeeeeeeeeeeeeeeER  u64 [sp + 0x58] = a0
2874                ...........................................D===================================eeeeeeeeeeeeeeeeeeeeeeeeeE------R  a0 = u8 [a3]
2875                ...........................................D=====================================eeeeeeeeeeeeeeeeeeeeeeeeeE----R  a1 = u8 [a1 + 0x4]
2876                ...........................................D==============================================================eeeE-R  a0 = a1 * a0
2877                ............................................D====================================eeeeeeeeeeeeeeeeeeeeeeeeeE----R  a1 = u8 [s0 + 0x23]
2878                ............................................D=============================================================eE---R  jump 0 if a1 != 0
2879            ",
2880        );
2881    }
2882
2883    #[test]
2884    fn test_l3_loads() {
        // Snapshot test: four loads of the form `a0 = u64 [a0]`, each using the value
        // written by the previous one as its address, followed by `ret`. Runs with
        // `CacheModel::L3Hit` (`memory_access_cost: 37`).
        //
        // Expected: all four loads have `D` in column 0, but each one's `e` span begins
        // only where the previous load's span ends, producing a staircase of `=` runs.
        // `ret` finishes executing early and then shows a long run of `-` before its
        // `R` at the end of the row.
        //
        // NOTE(review): `=` / `-` presumably mean "waiting to execute" / "waiting to
        // retire", as in llvm-mca's timeline view — confirm in `assert_timeline`.
2885        assert_timeline(CacheModel::L3Hit,
2886            "
2887                a0 = u64 [a0]
2888                a0 = u64 [a0]
2889                a0 = u64 [a0]
2890                a0 = u64 [a0]
2891                ret
2892            ",
2893            "
2894                DeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeER...............................................................................................................  a0 = u64 [a0]
2895                D=====================================eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeER..........................................................................  a0 = u64 [a0]
2896                D==========================================================================eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeER.....................................  a0 = u64 [a0]
2897                D===============================================================================================================eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeER  a0 = u64 [a0]
2898                .DeeeeeeeeeeeeeeeeeeeeeeE-----------------------------------------------------------------------------------------------------------------------------R  ret
2899            ",
2900        )
2901    }
2902
2903    #[test]
2904    fn test_ecalli() {
        // Snapshot test: an `ecalli 27` followed by `ret`.
        //
        // Expected: `ecalli` has an `e` span that covers almost the whole row. `ret`
        // has `D` one column later, executes without any `=` wait, and then shows a
        // long run of `-` before its `R` at the end of the row.
        //
        // NOTE(review): presumably `ecalli` is assigned a large fixed latency by the
        // cost model and `ret` must retire in order after it — confirm.
        // NOTE(review): glyph meanings presumably follow llvm-mca's timeline view —
        // confirm in `assert_timeline`.
2905        assert_timeline(
2906            test_config(),
2907            "
2908                ecalli 27
2909                ret
2910            ",
2911            "
2912                DeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeER  ecalli 27
2913                .DeeeeeeeeeeeeeeeeeeeeeeE-----------------------------------------------------------------------------R  ret
2914            ",
2915        );
2916    }
2917
2918    #[test]
2919    fn test_xor_and_shift() {
        // Snapshot test: `a1` is XOR-ed with an all-ones immediate and the result is
        // then used as the shift amount of `a1 = a0 >> a1`; the block ends with
        // `fallthrough`.
        //
        // Expected: the shift has `D` in column 0, waits a single `=` and executes in
        // the column right after the XOR's `e`; `fallthrough` has `D` in column 1 and
        // retires in the last column.
        //
        // NOTE(review): glyph meanings presumably follow llvm-mca's timeline view —
        // confirm in `assert_timeline`.
2920        assert_timeline(
2921            test_config(),
2922            "
2923                a1 = a1 ^ 0xffffffffffffffff
2924                a1 = a0 >> a1
2925                fallthrough
2926            ",
2927            "
2928                DeER..  a1 = a1 ^ 0xffffffffffffffff
2929                D=eER.  a1 = a0 >> a1
2930                .DeeER  fallthrough
2931            ",
2932        )
2933    }
2934
2935    #[test]
2936    fn test_move_reg_decode_slots() {
        // Snapshot test: four register-to-register moves followed by `trap`.
        //
        // Expected: all four moves have `D` in column 0 and no other glyphs, while
        // `trap` has its `D` pushed to column 1.
        //
        // NOTE(review): presumably each move still takes one decode slot, so the four
        // moves use up `MAX_DECODE_PER_CYCLE` (4) and `trap` has to wait for the next
        // column, even though moves show no execution — confirm against the cost table.
2937        assert_timeline(
2938            test_config(),
2939            "
2940                s0 = a1
2941                a0 = a1
2942                a1 = t0
2943                a2 = s1
2944                trap
2945            ",
2946            "
2947                D.....  s0 = a1
2948                D.....  a0 = a1
2949                D.....  a1 = t0
2950                D.....  a2 = s1
2951                .DeeER  trap
2952            ",
2953        )
2954    }
2955}