regalloc2/moves.rs
/*
 * Released under the terms of the Apache 2.0 license with LLVM
 * exception. See `LICENSE` for details.
 */

use crate::{ion::data_structures::u64_key, Allocation, PReg};
use smallvec::{smallvec, SmallVec};
use std::fmt::Debug;

/// A list of moves to be performed in sequence, with auxiliary data
/// attached to each.
pub type MoveVec<T> = SmallVec<[(Allocation, Allocation, T); 16]>;

/// A list of moves to be performed in sequence, like a
/// `MoveVec<T>`, except that an unchosen scratch space may occur as
/// well, represented by `Allocation::none()`.
#[derive(Clone, Debug)]
pub enum MoveVecWithScratch<T> {
    /// No scratch was actually used.
    NoScratch(MoveVec<T>),
    /// A scratch space was used.
    Scratch(MoveVec<T>),
}

/// A `ParallelMoves` represents a list of alloc-to-alloc moves that
/// must happen in parallel -- i.e., all reads of sources semantically
/// happen before all writes of destinations, and destinations are
/// allowed to overwrite sources. It can compute a list of sequential
/// moves that will produce the equivalent data movement, possibly
/// using a scratch register if one is necessary.
pub struct ParallelMoves<T: Clone + Copy + Default> {
    parallel_moves: MoveVec<T>,
}

impl<T: Clone + Copy + Default> ParallelMoves<T> {
    pub fn new() -> Self {
        Self {
            parallel_moves: smallvec![],
        }
    }

    pub fn add(&mut self, from: Allocation, to: Allocation, t: T) {
        self.parallel_moves.push((from, to, t));
    }

    fn sources_overlap_dests(&self) -> bool {
        // Assumes `parallel_moves` has already been sorted in `resolve()` below.
        for &(_, dst, _) in &self.parallel_moves {
            if self
                .parallel_moves
                .binary_search_by_key(&dst, |&(src, _, _)| src)
                .is_ok()
            {
                return true;
            }
        }
        false
    }

    /// Resolve the parallel-moves problem to a sequence of separate
    /// moves, such that the combined effect of the sequential moves
    /// is as-if all of the moves added to this `ParallelMoves`
    /// resolver happened in parallel.
    ///
    /// Sometimes, if there is a cycle, a scratch register is
    /// necessary to allow the moves to occur sequentially. In this
    /// case, `Allocation::none()` is returned to represent the
    /// scratch register. The caller may choose to always hold a
    /// separate scratch register unused to allow this to be trivially
    /// rewritten; or may dynamically search for or create a free
    /// register as needed, if none are available.
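    ///
    /// An illustrative usage sketch (the register choices here are
    /// hypothetical, not prescribed by this API):
    ///
    /// ```ignore
    /// // Swapping two registers forms a two-move cycle, so a scratch is needed.
    /// let r0 = Allocation::reg(PReg::new(0, RegClass::Int));
    /// let r1 = Allocation::reg(PReg::new(1, RegClass::Int));
    /// let mut par: ParallelMoves<()> = ParallelMoves::new();
    /// par.add(r0, r1, ());
    /// par.add(r1, r0, ());
    /// let seq = par.resolve();
    /// assert!(seq.needs_scratch());
    /// // The caller substitutes a real location for the
    /// // `Allocation::none()` placeholder via `with_scratch()`.
    /// ```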
    pub fn resolve(mut self) -> MoveVecWithScratch<T> {
        // Easy case: zero or one move. Just return our vec.
        if self.parallel_moves.len() <= 1 {
            return MoveVecWithScratch::NoScratch(self.parallel_moves);
        }

        // Sort moves by source so that we can efficiently test for
        // presence.
        self.parallel_moves
            .sort_by_key(|&(src, dst, _)| u64_key(src.bits(), dst.bits()));

        // Do any dests overlap sources? If not, we can also just
        // return the list.
        if !self.sources_overlap_dests() {
            return MoveVecWithScratch::NoScratch(self.parallel_moves);
        }

        // General case: some moves overwrite dests that other moves
        // read as sources. We'll use a general algorithm.
        //
        // *Important property*: because we expect that each register
        // has only one writer (otherwise the effect of the parallel
        // move is undefined), each move can only block one other move
        // (with its one source corresponding to the one writer of
        // that source). Thus, we *can only have simple cycles* (those
        // that are a ring of nodes, i.e., with only one path from a
        // node back to itself); there are no SCCs that are more
        // complex than that. We leverage this fact below to avoid
        // having to do a full Tarjan SCC DFS (with lowest-index
        // computation, etc.): instead, as soon as we find a cycle, we
        // know we have the full cycle and we can do a cyclic move
        // sequence and continue.
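        //
        // Concretely: { B := A, A := B } is a two-node cycle and
        // { B := A, C := B, A := C } is a three-node cycle; because
        // each destination is written exactly once, no move can sit
        // on two distinct cycles at the same time.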

        // Sort moves by destination and check that each destination
        // has only one writer.
        self.parallel_moves.sort_by_key(|&(_, dst, _)| dst);
        if cfg!(debug_assertions) {
            let mut last_dst = None;
            for &(_, dst, _) in &self.parallel_moves {
                if last_dst.is_some() {
                    debug_assert!(last_dst.unwrap() != dst);
                }
                last_dst = Some(dst);
            }
        }

        // Construct a mapping from move indices to moves they must
        // come before. A move must be emitted before any move that
        // overwrites its source; because moves are sorted by dest
        // above, we can efficiently find that overwriting move, if
        // any.
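        //
        // For example, given { B := A, C := B }, the move `C := B`
        // reads `B`, which `B := A` overwrites, so `C := B` must come
        // before `B := A` in the final sequence.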
        let mut must_come_before: SmallVec<[Option<usize>; 16]> =
            smallvec![None; self.parallel_moves.len()];
        for (i, &(src, _, _)) in self.parallel_moves.iter().enumerate() {
            if let Ok(move_to_dst_idx) = self
                .parallel_moves
                .binary_search_by_key(&src, |&(_, dst, _)| dst)
            {
                must_come_before[i] = Some(move_to_dst_idx);
            }
        }

        // Do a simple stack-based DFS and emit moves in postorder,
        // then reverse at the end for RPO. Unlike Tarjan's SCC
        // algorithm, we can emit a cycle as soon as we find one, as
        // noted above.
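        //
        // Continuing the example above: the DFS emits `B := A` and
        // then `C := B` in postorder, and the final reversal yields
        // `C := B; B := A`, which reads `B` before it is clobbered.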
        let mut ret: MoveVec<T> = smallvec![];
        let mut stack: SmallVec<[usize; 16]> = smallvec![];
        let mut visited: SmallVec<[bool; 16]> = smallvec![false; self.parallel_moves.len()];
        let mut onstack: SmallVec<[bool; 16]> = smallvec![false; self.parallel_moves.len()];
        let mut scratch_used = false;

        stack.push(0);
        onstack[0] = true;
        loop {
            if stack.is_empty() {
                if let Some(next) = visited.iter().position(|&flag| !flag) {
                    stack.push(next);
                    onstack[next] = true;
                } else {
                    break;
                }
            }

            let top = *stack.last().unwrap();
            visited[top] = true;
            match must_come_before[top] {
                None => {
                    ret.push(self.parallel_moves[top]);
                    onstack[top] = false;
                    stack.pop();
                    while let Some(top) = stack.pop() {
                        ret.push(self.parallel_moves[top]);
                        onstack[top] = false;
                    }
                }
                Some(next) if visited[next] && !onstack[next] => {
                    ret.push(self.parallel_moves[top]);
                    onstack[top] = false;
                    stack.pop();
                    while let Some(top) = stack.pop() {
                        ret.push(self.parallel_moves[top]);
                        onstack[top] = false;
                    }
                }
                Some(next) if !visited[next] && !onstack[next] => {
                    stack.push(next);
                    onstack[next] = true;
                    continue;
                }
                Some(next) => {
                    // Found a cycle -- emit a cyclic-move sequence
                    // for the cycle on the top of stack, then normal
                    // moves below it. Recall that these moves will be
                    // reversed in sequence, so from the original
                    // parallel move set
                    //
                    //     { B := A, C := B, A := C }
                    //
                    // we will generate something like:
                    //
                    //     A := scratch
                    //     B := A
                    //     C := B
                    //     scratch := C
                    //
                    // which will become:
                    //
                    //     scratch := C
                    //     C := B
                    //     B := A
                    //     A := scratch
                    let mut last_dst = None;
                    let mut scratch_src = None;
                    while let Some(move_idx) = stack.pop() {
                        onstack[move_idx] = false;
                        let (mut src, dst, dst_t) = self.parallel_moves[move_idx];
                        if last_dst.is_none() {
                            scratch_src = Some(src);
                            src = Allocation::none();
                            scratch_used = true;
                        } else {
                            debug_assert_eq!(last_dst.unwrap(), src);
                        }
                        ret.push((src, dst, dst_t));

                        last_dst = Some(dst);

                        if move_idx == next {
                            break;
                        }
                    }
                    if let Some(src) = scratch_src {
                        ret.push((src, Allocation::none(), T::default()));
                    }
                }
            }
        }

        ret.reverse();

        if scratch_used {
            MoveVecWithScratch::Scratch(ret)
        } else {
            MoveVecWithScratch::NoScratch(ret)
        }
    }
}

impl<T> MoveVecWithScratch<T> {
    /// Fills in the scratch space, if needed, with the given
    /// register/allocation and returns a final list of moves. The
    /// scratch register must not occur anywhere in the parallel-move
    /// problem given to the resolver that produced this
    /// `MoveVecWithScratch`.
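    ///
    /// An illustrative sketch (`resolved` and the scratch choice are
    /// hypothetical names, not part of this API):
    ///
    /// ```ignore
    /// let resolved = parallel_moves.resolve();
    /// let scratch = Allocation::reg(PReg::new(15, RegClass::Int));
    /// // Every `Allocation::none()` placeholder becomes `scratch`.
    /// let final_moves = resolved.with_scratch(scratch);
    /// ```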
    pub fn with_scratch(self, scratch: Allocation) -> MoveVec<T> {
        match self {
            MoveVecWithScratch::NoScratch(moves) => moves,
            MoveVecWithScratch::Scratch(mut moves) => {
                for (src, dst, _) in &mut moves {
                    debug_assert!(
                        *src != scratch && *dst != scratch,
                        "Scratch register should not also be an actual source or dest of moves"
                    );
                    debug_assert!(
                        !(src.is_none() && dst.is_none()),
                        "Move resolution should not have produced a scratch-to-scratch move"
                    );
                    if src.is_none() {
                        *src = scratch;
                    }
                    if dst.is_none() {
                        *dst = scratch;
                    }
                }
                moves
            }
        }
    }

    /// Unwrap without a scratch register.
    pub fn without_scratch(self) -> Option<MoveVec<T>> {
        match self {
            MoveVecWithScratch::NoScratch(moves) => Some(moves),
            MoveVecWithScratch::Scratch(..) => None,
        }
    }

    /// Do we need a scratch register?
    pub fn needs_scratch(&self) -> bool {
        match self {
            MoveVecWithScratch::NoScratch(..) => false,
            MoveVecWithScratch::Scratch(..) => true,
        }
    }

    /// Do any moves go from stack to stack?
    pub fn stack_to_stack(&self, is_stack_alloc: impl Fn(Allocation) -> bool) -> bool {
        match self {
            MoveVecWithScratch::NoScratch(moves) | MoveVecWithScratch::Scratch(moves) => moves
                .iter()
                .any(|&(src, dst, _)| is_stack_alloc(src) && is_stack_alloc(dst)),
        }
    }
}

/// Final stage of move resolution: finding or using scratch
/// registers, creating them if necessary by using stackslots, and
/// ensuring that the final list of moves contains no stack-to-stack
/// moves.
///
/// The resolved list of moves may need one or two scratch registers,
/// and maybe a stackslot, to ensure these conditions. Our general
/// strategy is in two steps.
///
/// First, we find a scratch register, so we only have to worry about
/// a list of moves, all with real locations as src and dest. If we're
/// lucky and there are any registers not allocated at this
/// program-point, we can use a real register. Otherwise, we use an
/// extra stackslot. This is fine, because at this step,
/// stack-to-stack moves are OK.
///
/// Then, we resolve stack-to-stack moves into stack-to-reg /
/// reg-to-stack pairs. For this, we try to allocate a second free
/// register. If unavailable, we create another scratch stackslot, and
/// we pick a "victim" register in the appropriate class, and we
/// resolve into: victim -> extra-stackslot; stack-src -> victim;
/// victim -> stack-dst; extra-stackslot -> victim.
///
/// Sometimes move elision will be able to clean this up a bit. But,
/// for simplicity reasons, let's keep the concerns separated! So we
/// always do the full expansion above.
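///
/// An illustrative construction sketch (the closures and the victim
/// choice are hypothetical; callers supply their own allocation
/// logic):
///
/// ```ignore
/// let resolver = MoveAndScratchResolver::new(
///     || find_free_preg_here(),      // hypothetical: a free reg at this point, if any
///     || allocate_extra_stackslot(), // hypothetical: a fresh spill-slot Allocation
///     |alloc| alloc.is_stack(),      // does this Allocation refer to a stackslot?
///     PReg::new(0, RegClass::Int),   // statically chosen last-resort victim
/// );
/// let final_moves = resolver.compute(parallel_moves.resolve());
/// ```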
pub struct MoveAndScratchResolver<GetReg, GetStackSlot, IsStackAlloc>
where
    GetReg: FnMut() -> Option<Allocation>,
    GetStackSlot: FnMut() -> Allocation,
    IsStackAlloc: Fn(Allocation) -> bool,
{
    /// Scratch register for stack-to-stack move expansion.
    stack_stack_scratch_reg: Option<Allocation>,
    /// Stackslot into which we need to save the stack-to-stack
    /// scratch reg before doing any stack-to-stack moves, if we stole
    /// the reg.
    stack_stack_scratch_reg_save: Option<Allocation>,
    /// Closure that finds us a free register (as an `Allocation`) at
    /// the current program point, if one is available.
    find_free_reg: GetReg,
    /// Closure that gets us a stackslot, if needed.
    get_stackslot: GetStackSlot,
    /// Closure to determine whether an `Allocation` refers to a stack slot.
    is_stack_alloc: IsStackAlloc,
    /// The victim PReg to evict to another stackslot at every
    /// stack-to-stack move if a free PReg is not otherwise
    /// available. Provided by caller and statically chosen. This is a
    /// very last-ditch option, so static choice is OK.
    victim: PReg,
}

impl<GetReg, GetStackSlot, IsStackAlloc> MoveAndScratchResolver<GetReg, GetStackSlot, IsStackAlloc>
where
    GetReg: FnMut() -> Option<Allocation>,
    GetStackSlot: FnMut() -> Allocation,
    IsStackAlloc: Fn(Allocation) -> bool,
{
    pub fn new(
        find_free_reg: GetReg,
        get_stackslot: GetStackSlot,
        is_stack_alloc: IsStackAlloc,
        victim: PReg,
    ) -> Self {
        Self {
            stack_stack_scratch_reg: None,
            stack_stack_scratch_reg_save: None,
            find_free_reg,
            get_stackslot,
            is_stack_alloc,
            victim,
        }
    }

    pub fn compute<T: Debug + Copy>(mut self, moves: MoveVecWithScratch<T>) -> MoveVec<T> {
        // First, do we have a vec with no stack-to-stack moves or use
        // of a scratch register? Fast return if so.
        if !moves.needs_scratch() && !moves.stack_to_stack(&self.is_stack_alloc) {
            return moves.without_scratch().unwrap();
        }

        let mut result = smallvec![];

        // Now, find a scratch allocation in order to resolve cycles.
        let scratch = (self.find_free_reg)().unwrap_or_else(|| (self.get_stackslot)());
        trace!("scratch resolver: scratch alloc {:?}", scratch);

        let moves = moves.with_scratch(scratch);
        for &(src, dst, data) in &moves {
            // Do we have a stack-to-stack move? If so, resolve.
            if (self.is_stack_alloc)(src) && (self.is_stack_alloc)(dst) {
                trace!("scratch resolver: stack to stack: {:?} -> {:?}", src, dst);
                // Lazily allocate a stack-to-stack scratch.
                if self.stack_stack_scratch_reg.is_none() {
                    if let Some(reg) = (self.find_free_reg)() {
                        trace!(
                            "scratch resolver: have free stack-to-stack scratch preg: {:?}",
                            reg
                        );
                        self.stack_stack_scratch_reg = Some(reg);
                    } else {
                        self.stack_stack_scratch_reg = Some(Allocation::reg(self.victim));
                        self.stack_stack_scratch_reg_save = Some((self.get_stackslot)());
                        trace!(
                            "scratch resolver: stack-to-stack using victim {:?} with save stackslot {:?}",
                            self.stack_stack_scratch_reg,
                            self.stack_stack_scratch_reg_save
                        );
                    }
                }

                // If we have a "victimless scratch", then do a
                // stack-to-scratch / scratch-to-stack sequence.
                if self.stack_stack_scratch_reg_save.is_none() {
                    result.push((src, self.stack_stack_scratch_reg.unwrap(), data));
                    result.push((self.stack_stack_scratch_reg.unwrap(), dst, data));
                }
                // Otherwise, save the current value in the
                // stack-to-stack scratch reg (which is our victim) to
                // the extra stackslot, then do the stack-to-scratch /
                // scratch-to-stack sequence, then restore it.
                else {
                    result.push((
                        self.stack_stack_scratch_reg.unwrap(),
                        self.stack_stack_scratch_reg_save.unwrap(),
                        data,
                    ));
                    result.push((src, self.stack_stack_scratch_reg.unwrap(), data));
                    result.push((self.stack_stack_scratch_reg.unwrap(), dst, data));
                    result.push((
                        self.stack_stack_scratch_reg_save.unwrap(),
                        self.stack_stack_scratch_reg.unwrap(),
                        data,
                    ));
                }
            } else {
                // Normal move.
                result.push((src, dst, data));
            }
        }

        trace!("scratch resolver: got {:?}", result);
        result
    }
}
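
// Illustrative sketch: minimal tests showing the shape of the
// sequences produced for a two-register swap and for the
// three-register cycle used as an example in `resolve()`. Register
// numbers and classes are arbitrary choices for the example.
#[cfg(test)]
mod example_tests {
    use super::*;
    use crate::{Allocation, PReg, RegClass};

    #[test]
    fn swap_needs_scratch() {
        // Two-register swap: a two-move cycle.
        let r0 = Allocation::reg(PReg::new(0, RegClass::Int));
        let r1 = Allocation::reg(PReg::new(1, RegClass::Int));
        let scratch = Allocation::reg(PReg::new(2, RegClass::Int));

        let mut par: ParallelMoves<()> = ParallelMoves::new();
        par.add(r0, r1, ());
        par.add(r1, r0, ());

        let resolved = par.resolve();
        assert!(resolved.needs_scratch());

        // One value is parked in the scratch, so the swap becomes
        // three sequential moves.
        assert_eq!(resolved.with_scratch(scratch).len(), 3);
    }

    #[test]
    fn three_cycle_needs_scratch() {
        // The cycle from the comment in `resolve()`: { B := A, C := B, A := C }.
        let a = Allocation::reg(PReg::new(0, RegClass::Int));
        let b = Allocation::reg(PReg::new(1, RegClass::Int));
        let c = Allocation::reg(PReg::new(2, RegClass::Int));
        let scratch = Allocation::reg(PReg::new(3, RegClass::Int));

        let mut par: ParallelMoves<()> = ParallelMoves::new();
        par.add(a, b, ());
        par.add(b, c, ());
        par.add(c, a, ());

        let resolved = par.resolve();
        assert!(resolved.needs_scratch());

        // Three cycle moves plus one move through the scratch: four
        // sequential moves.
        assert_eq!(resolved.with_scratch(scratch).len(), 4);
    }
}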