@@ -38,64 +38,125 @@ impl<T> Drop for CopyOnDrop<T> {
38
38
}
39
39
}
40
40
41
- /// Sorts a slice using insertion sort, which is `O(n^2)` worst-case .
42
- fn insertion_sort < T , F > ( v : & mut [ T ] , is_less : & mut F )
41
+ /// Shifts the first element to the right until it encounters a greater or equal element .
42
+ fn shift_head < T , F > ( v : & mut [ T ] , is_less : & mut F )
43
43
where F : FnMut ( & T , & T ) -> bool
44
44
{
45
45
let len = v. len ( ) ;
46
+ unsafe {
47
+ // If the first two elements are out-of-order...
48
+ if len >= 2 && is_less ( v. get_unchecked ( 1 ) , v. get_unchecked ( 0 ) ) {
49
+ // Read the first element into a stack-allocated variable. If a following comparison
50
+ // operation panics, `hole` will get dropped and automatically write the element back
51
+ // into the slice.
52
+ let mut tmp = NoDrop { value : ptr:: read ( v. get_unchecked ( 0 ) ) } ;
53
+ let mut hole = CopyOnDrop {
54
+ src : & mut tmp. value ,
55
+ dest : v. get_unchecked_mut ( 1 ) ,
56
+ } ;
57
+ ptr:: copy_nonoverlapping ( v. get_unchecked ( 1 ) , v. get_unchecked_mut ( 0 ) , 1 ) ;
46
58
47
- for i in 1 ..len {
48
- unsafe {
49
- if is_less ( v. get_unchecked ( i) , v. get_unchecked ( i - 1 ) ) {
50
- // There are three ways to implement insertion here:
51
- //
52
- // 1. Swap adjacent elements until the first one gets to its final destination.
53
- // However, this way we copy data around more than is necessary. If elements are
54
- // big structures (costly to copy), this method will be slow.
55
- //
56
- // 2. Iterate until the right place for the first element is found. Then shift the
57
- // elements succeeding it to make room for it and finally place it into the
58
- // remaining hole. This is a good method.
59
- //
60
- // 3. Copy the first element into a temporary variable. Iterate until the right
61
- // place for it is found. As we go along, copy every traversed element into the
62
- // slot preceding it. Finally, copy data from the temporary variable into the
63
- // remaining hole. This method is very good. Benchmarks demonstrated slightly
64
- // better performance than with the 2nd method.
65
- //
66
- // All methods were benchmarked, and the 3rd showed best results. So we chose that
67
- // one.
68
- let mut tmp = NoDrop { value : ptr:: read ( v. get_unchecked ( i) ) } ;
69
-
70
- // Intermediate state of the insertion process is always tracked by `hole`, which
71
- // serves two purposes:
72
- // 1. Protects integrity of `v` from panics in `is_less`.
73
- // 2. Fills the remaining hole in `v` in the end.
74
- //
75
- // Panic safety:
76
- //
77
- // If `is_less` panics at any point during the process, `hole` will get dropped and
78
- // fill the hole in `v` with `tmp`, thus ensuring that `v` still holds every object
79
- // it initially held exactly once.
80
- let mut hole = CopyOnDrop {
81
- src : & mut tmp. value ,
82
- dest : v. get_unchecked_mut ( i - 1 ) ,
83
- } ;
84
- ptr:: copy_nonoverlapping ( v. get_unchecked ( i - 1 ) , v. get_unchecked_mut ( i) , 1 ) ;
85
-
86
- for h in ( 0 ..i-1 ) . rev ( ) {
87
- if !is_less ( & tmp. value , v. get_unchecked ( h) ) {
88
- break ;
89
- }
90
- ptr:: copy_nonoverlapping ( v. get_unchecked ( h) , v. get_unchecked_mut ( h + 1 ) , 1 ) ;
91
- hole. dest = v. get_unchecked_mut ( h) ;
59
+ for i in 2 ..len {
60
+ if !is_less ( & v[ i] , & tmp. value ) {
61
+ break ;
92
62
}
93
- // `hole` gets dropped and thus copies `tmp` into the remaining hole in `v`.
63
+
64
+ // Move `i`-th element one place to the left, thus shifting the hole to the right.
65
+ ptr:: copy_nonoverlapping ( v. get_unchecked ( i) , v. get_unchecked_mut ( i - 1 ) , 1 ) ;
66
+ hole. dest = v. get_unchecked_mut ( i) ;
94
67
}
68
+ // `hole` gets dropped and thus copies `tmp` into the remaining hole in `v`.
95
69
}
96
70
}
97
71
}
98
72
73
+ /// Shifts the last element to the left until it encounters a smaller or equal element.
74
+ fn shift_tail < T , F > ( v : & mut [ T ] , is_less : & mut F )
75
+ where F : FnMut ( & T , & T ) -> bool
76
+ {
77
+ let len = v. len ( ) ;
78
+ unsafe {
79
+ // If the last two elements are out-of-order...
80
+ if len >= 2 && is_less ( v. get_unchecked ( len - 1 ) , v. get_unchecked ( len - 2 ) ) {
81
+ // Read the last element into a stack-allocated variable. If a following comparison
82
+ // operation panics, `hole` will get dropped and automatically write the element back
83
+ // into the slice.
84
+ let mut tmp = NoDrop { value : ptr:: read ( v. get_unchecked ( len - 1 ) ) } ;
85
+ let mut hole = CopyOnDrop {
86
+ src : & mut tmp. value ,
87
+ dest : v. get_unchecked_mut ( len - 2 ) ,
88
+ } ;
89
+ ptr:: copy_nonoverlapping ( v. get_unchecked ( len - 2 ) , v. get_unchecked_mut ( len - 1 ) , 1 ) ;
90
+
91
+ for i in ( 0 ..len-2 ) . rev ( ) {
92
+ if !is_less ( & tmp. value , v. get_unchecked ( i) ) {
93
+ break ;
94
+ }
95
+
96
+ // Move `i`-th element one place to the right, thus shifting the hole to the left.
97
+ ptr:: copy_nonoverlapping ( v. get_unchecked ( i) , v. get_unchecked_mut ( i + 1 ) , 1 ) ;
98
+ hole. dest = v. get_unchecked_mut ( i) ;
99
+ }
100
+ // `hole` gets dropped and thus copies `tmp` into the remaining hole in `v`.
101
+ }
102
+ }
103
+ }
104
+
105
+ /// Partially sorts a slice by shifting several out-of-order elements around.
106
+ ///
107
+ /// Returns true if the slice is sorted at the end. This function is `O(n)` worst-case.
108
+ #[ cold]
109
+ fn partial_insertion_sort < T , F > ( v : & mut [ T ] , is_less : & mut F ) -> bool
110
+ where F : FnMut ( & T , & T ) -> bool
111
+ {
112
+ // Maximum number of adjacent out-of-order pairs that will get shifted.
113
+ const MAX_STEPS : usize = 5 ;
114
+ // If the slice is shorter than this, don't shift any elements.
115
+ const SHORTEST_SHIFTING : usize = 50 ;
116
+
117
+ let len = v. len ( ) ;
118
+ let mut i = 1 ;
119
+
120
+ for _ in 0 ..MAX_STEPS {
121
+ unsafe {
122
+ // Find the next pair of adjacent out-of-order elements.
123
+ while i < len && !is_less ( v. get_unchecked ( i) , v. get_unchecked ( i - 1 ) ) {
124
+ i += 1 ;
125
+ }
126
+ }
127
+
128
+ // Are we done?
129
+ if i == len {
130
+ return true ;
131
+ }
132
+
133
+ // Don't shift elements on short arrays, that has a performance cost.
134
+ if len < SHORTEST_SHIFTING {
135
+ return false ;
136
+ }
137
+
138
+ // Swap the found pair of elements. This puts them in correct order.
139
+ v. swap ( i - 1 , i) ;
140
+
141
+ // Shift the smaller element to the left.
142
+ shift_tail ( & mut v[ ..i] , is_less) ;
143
+ // Shift the greater element to the right.
144
+ shift_head ( & mut v[ i..] , is_less) ;
145
+ }
146
+
147
+ // Didn't manage to sort the slice in the limited number of steps.
148
+ false
149
+ }
150
+
151
+ /// Sorts a slice using insertion sort, which is `O(n^2)` worst-case.
152
+ fn insertion_sort < T , F > ( v : & mut [ T ] , is_less : & mut F )
153
+ where F : FnMut ( & T , & T ) -> bool
154
+ {
155
+ for i in 2 ..v. len ( ) +1 {
156
+ shift_tail ( & mut v[ ..i] , is_less) ;
157
+ }
158
+ }
159
+
99
160
/// Sorts `v` using heapsort, which guarantees `O(n log n)` worst-case.
100
161
#[ cold]
101
162
fn heapsort < T , F > ( v : & mut [ T ] , is_less : & mut F )
@@ -180,6 +241,9 @@ fn partition_in_blocks<T, F>(v: &mut [T], pivot: &T, is_less: &mut F) -> usize
180
241
let mut end_r = ptr:: null_mut ( ) ;
181
242
let mut offsets_r: [ u8 ; BLOCK ] = unsafe { mem:: uninitialized ( ) } ;
182
243
244
+ // FIXME: When we get VLAs, try creating one array of length `min(v.len(), 2 * BLOCK)` rather
245
+ // than two fixed-size arrays of length `BLOCK`. VLAs might be more cache-efficient.
246
+
183
247
// Returns the number of elements between pointers `l` (inclusive) and `r` (exclusive).
184
248
fn width < T > ( l : * mut T , r : * mut T ) -> usize {
185
249
assert ! ( mem:: size_of:: <T >( ) > 0 ) ;
@@ -470,10 +534,10 @@ fn break_patterns<T>(v: &mut [T]) {
470
534
fn choose_pivot < T , F > ( v : & mut [ T ] , is_less : & mut F ) -> ( usize , bool )
471
535
where F : FnMut ( & T , & T ) -> bool
472
536
{
473
- // Minimal length to choose the median-of-medians method.
537
+ // Minimum length to choose the median-of-medians method.
474
538
// Shorter slices use the simple median-of-three method.
475
- const SHORTEST_MEDIAN_OF_MEDIANS : usize = 80 ;
476
- // Maximal number of swaps that can be performed in this function.
539
+ const SHORTEST_MEDIAN_OF_MEDIANS : usize = 50 ;
540
+ // Maximum number of swaps that can be performed in this function.
477
541
const MAX_SWAPS : usize = 4 * 3 ;
478
542
479
543
let len = v. len ( ) ;
@@ -522,7 +586,7 @@ fn choose_pivot<T, F>(v: &mut [T], is_less: &mut F) -> (usize, bool)
522
586
if swaps < MAX_SWAPS {
523
587
( b, swaps == 0 )
524
588
} else {
525
- // The maximal number of swaps was performed. Chances are the slice is descending or mostly
589
+ // The maximum number of swaps was performed. Chances are the slice is descending or mostly
526
590
// descending, so reversing will probably help sort it faster.
527
591
v. reverse ( ) ;
528
592
( len - 1 - b, true )
@@ -575,8 +639,9 @@ fn recurse<'a, T, F>(mut v: &'a mut [T], is_less: &mut F, mut pred: Option<&'a T
575
639
// If the last partitioning was decently balanced and didn't shuffle elements, and if pivot
576
640
// selection predicts the slice is likely already sorted...
577
641
if was_balanced && was_partitioned && likely_sorted {
578
- // Check whether the slice really is sorted. If so, we're done.
579
- if v. windows ( 2 ) . all ( |w| !is_less ( & w[ 1 ] , & w[ 0 ] ) ) {
642
+ // Try identifying several out-of-order elements and shifting them to correct
643
+ // positions. If the slice ends up being completely sorted, we're done.
644
+ if partial_insertion_sort ( v, is_less) {
580
645
return ;
581
646
}
582
647
}
0 commit comments