@@ -13,7 +13,7 @@ use crate::scoped_hash_map::ScopedHashMap;
 use crate::trace;
 use alloc::vec::Vec;
 use cranelift_control::ControlPlane;
-use cranelift_entity::{SecondaryMap, packed_option::ReservedValue};
+use cranelift_entity::{EntitySet, SecondaryMap, packed_option::ReservedValue};
 use rustc_hash::{FxHashMap, FxHashSet};
 use smallvec::{SmallVec, smallvec};

@@ -219,7 +219,72 @@ impl<'a> Elaborator<'a> {
         self.cur_block = block;
     }

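+    /// Collect this function's values, topologically sorted so that a value's
+    /// operands (defs) always appear before the values that use them.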
+    fn topo_sorted_values(&self) -> Vec<Value> {
+        #[derive(Debug)]
+        enum Event {
+            Enter,
+            Exit,
+        }
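+        // `Enter` schedules a value's operands for processing; `Exit` fires
+        // once those operands are done. Together they drive an explicit,
+        // non-recursive post-order walk of the DFG.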
+        let mut stack = Vec::<(Event, Value)>::new();
+
+        // Traverse the CFG in pre-order so that, when we look at the
+        // instructions and operands inside each block, we see value defs before
+        // uses.
+        for block in crate::traversals::Dfs::new().pre_order_iter(&self.func) {
+            for inst in self.func.layout.block_insts(block) {
+                stack.extend(self.func.dfg.inst_values(inst).map(|v| (Event::Enter, v)));
+            }
+        }
+
+        // We pushed in the desired order, so popping would implicitly reverse
+        // that. Avoid that by reversing the initial stack before we start
+        // traversing the DFG.
+        stack.reverse();
+
+        let mut sorted = Vec::with_capacity(self.func.dfg.num_values());
+        let mut seen = EntitySet::<Value>::with_capacity(self.func.dfg.num_values());
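+        // `seen` ensures that each value is visited and emitted exactly once,
+        // even when it appears as an operand of many instructions.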
+
+        // Post-order traversal of the DFG, visiting value defs before uses.
+        while let Some((event, value)) = stack.pop() {
+            match event {
+                Event::Enter => {
+                    if seen.insert(value) {
+                        stack.push((Event::Exit, value));
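+                        // The `Exit` event pushed above is only popped again
+                        // after every operand pushed below has been processed.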
+                        match self.func.dfg.value_def(value) {
+                            ValueDef::Result(inst, _) => {
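+                                // Push operands in reverse, skipping ones we
+                                // have already seen, so that popping visits
+                                // them in their original order exactly once.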
+                                stack.extend(
+                                    self.func
+                                        .dfg
+                                        .inst_values(inst)
+                                        .rev()
+                                        .filter(|v| !seen.contains(*v))
+                                        .map(|v| (Event::Enter, v)),
+                                );
+                            }
+                            ValueDef::Union(a, b) => {
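+                                // Both e-class alternatives are operands of
+                                // the union, so visit each before emitting the
+                                // union itself.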
+                                if !seen.contains(b) {
+                                    stack.push((Event::Enter, b));
+                                }
+                                if !seen.contains(a) {
+                                    stack.push((Event::Enter, a));
+                                }
+                            }
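+                            // Block parameters have no operands, so there is
+                            // nothing more to push.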
+                            ValueDef::Param(..) => {}
+                        }
+                    }
+                }
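+                // All of this value's operands have already been emitted, so
+                // it is now safe to append the value itself.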
+                Event::Exit => {
+                    sorted.push(value);
+                }
+            }
+        }
+
+        sorted
+    }
+
     fn compute_best_values(&mut self) {
+        let sorted_values = self.topo_sorted_values();
+
         let best = &mut self.value_to_best_value;

         // We can't make random decisions inside the fixpoint loop below because
@@ -230,126 +295,98 @@ impl<'a> Elaborator<'a> {
         // how to do instead of the best.
         let use_worst = self.ctrl_plane.get_decision();

-        // Do a fixpoint loop to compute the best value for each eclass.
-        //
-        // The maximum number of iterations is the length of the longest chain
-        // of `vNN -> vMM` edges in the dataflow graph where `NN < MM`, so this
-        // is *technically* quadratic, but `cranelift-frontend` won't construct
-        // any such edges. NaN canonicalization will introduce some of these
-        // edges, but they are chains of only two or three edges. So in
-        // practice, we *never* do more than a handful of iterations here unless
-        // (a) we parsed the CLIF from text and the text was funkily numbered,
-        // which we don't really care about, or (b) the CLIF producer did
-        // something weird, in which case it is their responsibility to stop
-        // doing that.
         trace!(
-            "Entering fixpoint loop to compute the {} values for each eclass",
+            "Computing the {} values for each eclass",
             if use_worst {
                 "worst (chaos mode)"
             } else {
                 "best"
             }
         );
-        let mut keep_going = true;
-        while keep_going {
-            keep_going = false;
-            trace!(
-                "fixpoint iteration {}",
-                self.stats.elaborate_best_cost_fixpoint_iters
-            );
-            self.stats.elaborate_best_cost_fixpoint_iters += 1;
-
-            for (value, def) in self.func.dfg.values_and_defs() {
-                trace!("computing best for value {:?} def {:?}", value, def);
-                let orig_best_value = best[value];
-
-                match def {
-                    ValueDef::Union(x, y) => {
-                        // Pick the best of the two options based on
-                        // min-cost. This works because each element of `best`
-                        // is a `(cost, value)` tuple; `cost` comes first so
-                        // the natural comparison works based on cost, and
-                        // breaks ties based on value number.
-                        best[value] = if use_worst {
-                            if best[x].1.is_reserved_value() {
-                                best[y]
-                            } else if best[y].1.is_reserved_value() {
-                                best[x]
-                            } else {
-                                std::cmp::max(best[x], best[y])
-                            }
-                        } else {
-                            std::cmp::min(best[x], best[y])
-                        };
-                        trace!(
-                            " -> best of union({:?}, {:?}) = {:?}",
-                            best[x], best[y], best[value]
-                        );
-                    }
-                    ValueDef::Param(_, _) => {
+
+        // Because the values are topologically sorted, we know that we will see
+        // defs before uses, so an instruction's operands' costs will already be
+        // computed by the time we are computing the cost for the current value
+        // and its instruction.
+        for value in sorted_values.iter().copied() {
+            let def = self.func.dfg.value_def(value);
+            trace!("computing best for value {:?} def {:?}", value, def);
+
+            match def {
+                // Pick the best of the two options based on min-cost. This
+                // works because each element of `best` is a `(cost, value)`
+                // tuple; `cost` comes first so the natural comparison works
+                // based on cost, and breaks ties based on value number.
+                ValueDef::Union(x, y) => {
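+                    // Thanks to the topological order, both sides of the union
+                    // already have a computed best value.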
+                    debug_assert!(!best[x].1.is_reserved_value());
+                    debug_assert!(!best[y].1.is_reserved_value());
+                    best[value] = if use_worst {
+                        std::cmp::max(best[x], best[y])
+                    } else {
+                        std::cmp::min(best[x], best[y])
+                    };
+                    trace!(
+                        " -> best of union({:?}, {:?}) = {:?}",
+                        best[x], best[y], best[value]
+                    );
+                }
+
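+                // Block parameters are always available, so they cost nothing.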
+                ValueDef::Param(_, _) => {
+                    best[value] = BestEntry(Cost::zero(), value);
+                }
+
+                // If the Inst is inserted into the layout (which is,
+                // at this point, only the side-effecting skeleton),
+                // then it must be computed and thus we give it zero
+                // cost.
+                ValueDef::Result(inst, _) => {
+                    if let Some(_) = self.func.layout.inst_block(inst) {
                         best[value] = BestEntry(Cost::zero(), value);
+                    } else {
+                        let inst_data = &self.func.dfg.insts[inst];
+                        // N.B.: at this point we know that the opcode is
+                        // pure, so `pure_op_cost`'s precondition is
+                        // satisfied.
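+                        // Operand costs are already available here because
+                        // `sorted_values` puts defs before uses.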
+                        let cost = Cost::of_pure_op(
+                            inst_data.opcode(),
+                            self.func.dfg.inst_values(inst).map(|value| {
+                                debug_assert!(!best[value].1.is_reserved_value());
+                                best[value].0
+                            }),
+                        );
+                        best[value] = BestEntry(cost, value);
+                        trace!(" -> cost of value {} = {:?}", value, cost);
                     }
-                    // If the Inst is inserted into the layout (which is,
-                    // at this point, only the side-effecting skeleton),
-                    // then it must be computed and thus we give it zero
-                    // cost.
-                    ValueDef::Result(inst, _) => {
-                        if let Some(_) = self.func.layout.inst_block(inst) {
-                            best[value] = BestEntry(Cost::zero(), value);
-                        } else {
-                            let inst_data = &self.func.dfg.insts[inst];
-                            // N.B.: at this point we know that the opcode is
-                            // pure, so `pure_op_cost`'s precondition is
-                            // satisfied.
-                            let cost = Cost::of_pure_op(
-                                inst_data.opcode(),
-                                self.func.dfg.inst_values(inst).map(|value| best[value].0),
-                            );
-                            best[value] = BestEntry(cost, value);
-                            trace!(" -> cost of value {} = {:?}", value, cost);
-                        }
-                    }
-                };
-
-                // Keep on iterating the fixpoint loop while we are finding new
-                // best values.
-                keep_going |= orig_best_value != best[value];
-            }
-        }
+                }
+            };

-        if cfg!(any(feature = "trace-log", debug_assertions)) {
-            trace!("finished fixpoint loop to compute best value for each eclass");
-            for value in self.func.dfg.values() {
-                trace!("-> best for eclass {:?}: {:?}", value, best[value]);
-                debug_assert_ne!(best[value].1, Value::reserved_value());
-                // You might additionally be expecting an assert that the best
-                // cost is not infinity, however infinite cost *can* happen in
-                // practice. First, note that our cost function doesn't know
-                // about any shared structure in the dataflow graph, it only
-                // sums operand costs. (And trying to avoid that by deduping a
-                // single operation's operands is a losing game because you can
-                // always just add one indirection and go from `add(x, x)` to
-                // `add(foo(x), bar(x))` to hide the shared structure.) Given
-                // that blindness to sharing, we can make cost grow
-                // exponentially with a linear sequence of operations:
-                //
-                // v0 = iconst.i32 1 ;; cost = 1
-                // v1 = iadd v0, v0 ;; cost = 3 + 1 + 1
-                // v2 = iadd v1, v1 ;; cost = 3 + 5 + 5
-                // v3 = iadd v2, v2 ;; cost = 3 + 13 + 13
-                // v4 = iadd v3, v3 ;; cost = 3 + 29 + 29
-                // v5 = iadd v4, v4 ;; cost = 3 + 61 + 61
-                // v6 = iadd v5, v5 ;; cost = 3 + 125 + 125
-                // ;; etc...
-                //
-                // Such a chain can cause cost to saturate to infinity. How do
-                // we choose which e-node is best when there are multiple that
-                // have saturated to infinity? It doesn't matter. As long as
-                // invariant (2) for optimization rules is upheld by our rule
-                // set (see `cranelift/codegen/src/opts/README.md`) it is safe
-                // to choose *any* e-node in the e-class. At worst we will
-                // produce suboptimal code, but never an incorrectness.
-            }
+            // You might be expecting an assert that the best cost we just
+            // computed is not infinity, however infinite cost *can* happen in
+            // practice. First, note that our cost function doesn't know about
+            // any shared structure in the dataflow graph, it only sums operand
+            // costs. (And trying to avoid that by deduping a single operation's
+            // operands is a losing game because you can always just add one
+            // indirection and go from `add(x, x)` to `add(foo(x), bar(x))` to
+            // hide the shared structure.) Given that blindness to sharing, we
+            // can make cost grow exponentially with a linear sequence of
+            // operations:
+            //
+            // v0 = iconst.i32 1 ;; cost = 1
+            // v1 = iadd v0, v0 ;; cost = 3 + 1 + 1
+            // v2 = iadd v1, v1 ;; cost = 3 + 5 + 5
+            // v3 = iadd v2, v2 ;; cost = 3 + 13 + 13
+            // v4 = iadd v3, v3 ;; cost = 3 + 29 + 29
+            // v5 = iadd v4, v4 ;; cost = 3 + 61 + 61
+            // v6 = iadd v5, v5 ;; cost = 3 + 125 + 125
+            // ;; etc...
+            //
+            // Such a chain can cause cost to saturate to infinity. How do we
+            // choose which e-node is best when there are multiple that have
+            // saturated to infinity? It doesn't matter. As long as invariant
+            // (2) for optimization rules is upheld by our rule set (see
+            // `cranelift/codegen/src/opts/README.md`) it is safe to choose
+            // *any* e-node in the e-class. At worst we will produce suboptimal
+            // code, but never an incorrectness.
         }
     }
