TimSort——优化了的归并排序
-
具体算法:
TimSort在经典的归并排序的基础上,增加了以下特点- I.规定了分组的最小长度,如果分组长度小于最小长度且不是最后一个分组时,将扩充到最小长度分组
- II.设立一个待排序列栈,最大的待排序列数量为3,没进入一个序列将触发一次合并
- III.截取两个分组中需要交换的序列进行交换而不需要交换位置的元素直接放入a中
- IV.在交换位置过程中,出现连续比较胜出的序列将被整体交换以减少逐个交换时产生的开销
-
应用:在Java中用在对对象数组的排序,原因在于,相比较基本数据类型的比较,对象的比较需要实现comparator接口,开销更大,因此需要尽可能避免逐个比较,采用TimSort是很好的选择。
//java.util.TimSort<T>代码清单分析
static <T> void sort(T[] a, int lo, int hi, Comparator<? super T> c,
T[] work, int workBase, int workLen) {
assert c != null && a != null && lo >= 0 && lo <= hi && hi <= a.length;
int nRemaining = hi - lo;
if (nRemaining < 2)
return; // Arrays of size 0 and 1 are always sorted
//当数组长度较小的时候,使用没有归并的Timsort————二进制排序即可
if (nRemaining < MIN_MERGE) {
int initRunLen = countRunAndMakeAscending(a, lo, hi, c);//找出第一个有序序列并确保是上升序列(若是下降序列则转换成下降序列)
binarySort(a, lo, hi, lo + initRunLen, c);
return;
}
TimSort<T> ts = new TimSort<>(a, c, work, workBase, workLen);
int minRun = minRunLength(nRemaining);//获得序列分组最小长度minRun
do {
//截取接着的上升有序序列
int runLen = countRunAndMakeAscending(a, lo, hi, c);
//如果该序列长度小于minRun,在nRemaining不小于minRun时则将该序列往后扩充到minRun,否则不扩充。
if (runLen < minRun) {
int force = nRemaining <= minRun ? nRemaining : minRun;
binarySort(a, lo, lo + force, lo + runLen, c);
runLen = force;
}
//将该序列分组的长度写入待排栈中,runBase[i] + runLen[i] == runBase[i + 1]
ts.pushRun(lo, runLen);
//执行排序
ts.mergeCollapse();
// Advance to find next run
lo += runLen;
nRemaining -= runLen;
} while (nRemaining != 0);
// Merge all remaining runs to complete sort
assert lo == hi;
ts.mergeForceCollapse();
assert ts.stackSize == 1;
}
//将得到的新序列分组和一个合并后的分组进行合并
private void mergeCollapse() {
//由于循环条件时stackSize > 1而stackSize在进入这个方法之前是从0递增的,因此,当stackSize=2(即已有两个序列被放入)的时候就会进入循环。未合并的分组决定不对于3组
while (stackSize > 1) {
int n = stackSize - 2;
n分组的长度已经大于n+1分组的长度不进行分组跳出分组合并后,run会增加一个分组,此时如果runLen[n-1] <= runlen[n] + runLen[n+1]且runLen[n - 1] < runLen[n + 1]时,合并n-1分组和n分组
if (n > 0 && runLen[n-1] <= runlen[n] + runLen[n+1]) {
if (runLen[n - 1] < runLen[n + 1])
n--;
mergeAt(n);
} else if (runLen[n] <= runLen[n + 1]) {//n=0,由于执行mergeAt方法后stackSize会自减一,因此一般来说,n总是为0。但如果n分组的长度已经大于n+1分组的长度,则会跳过分组合并,此时stackSize将>2
//进入真正的排序方法
mergeAt(n);
} else {
break; // Invariant is established
}
}
}
private void mergeAt(int i) {
assert stackSize >= 2;
assert i >= 0;
assert i == stackSize - 2 || i == stackSize - 3;
int base1 = runBase[i];
int len1 = runLen[i];
int base2 = runBase[i + 1];
int len2 = runLen[i + 1];
assert len1 > 0 && len2 > 0;
assert base1 + len1 == base2;
//提前将合并后的分组的长度更新到待排栈中
runLen[i] = len1 + len2;
if (i == stackSize - 3) {
runBase[i + 1] = runBase[i + 2];
runLen[i + 1] = runLen[i + 2];
}
stackSize--;//注意,这里stackSize自减一了
//找到run2第一个元素在run1中的位置,run1这个位置之前的元素无需参与排序,因为他们是最小的,在合并后的分组中肯定还在这些位置,因此实际上run1参与排序的base应该是base1+k,len是len1-k
int k = gallopRight(a[base2], a, base1, len1, 0, c);
assert k >= 0;
base1 += k;
len1 -= k;
if (len1 == 0)
return;
//找到run1最后一个元素在run2中的位置,run2这个位置之后的元素无需参与排序,因为它们在是最大的,合并后的分组中也肯定还在这些位置,因此实际上run2参与排序的len是执行gallopLeft后的len2,base还是base2
len2 = gallopLeft(a[base1 + len1 - 1], a, base2, len2, len2 - 1, c);
assert len2 >= 0;
if (len2 == 0)
return;
// Merge remaining runs, using tmp array with min(len1, len2) elements
if (len1 <= len2)
mergeLo(base1, len1, base2, len2);
else
mergeHi(base1, len1, base2, len2);
}
//先找出key分布在右序列中的哪个区间,再在这个区间中查找确切位置相比较逐个比较可以减少比较的次数,提高速度
//值得注意的是返回的ofs的指向为key<a[keyIndex+ofs],
//因此当ofs=1的时候表明run2中的所有元素都比key要大,没有元素需要跟run1交换
private static <T> int gallopRight(T key, T[] a, int base, int len,
int hint, Comparator<? super T> c) {
assert len > 0 && hint >= 0 && hint < len;
int ofs = 1;
int lastOfs = 0;
//采用二分选数快速定位区间的方法,找到区间a[keyIndex+lastOfs] <= key < a[keyIndex+Ofs]
//如果key小于右序列最大的元素,则表明所求区间应该从右往左更容易得到
//即base+hint为参照求得[keyIndex+hint - ofs] <= key < a[keyIndex+hint - lastOfs]
if (c.compare(key, a[base + hint]) < 0) {
int maxOfs = hint + 1;//gallopRight中,maxOfs是Ofs的最大值,即最后能定位到的最小左顶点为base-1,也就是key本身
//在右序列中从右往左按ofs为长度选数定位
//若找到的左顶点仍大于key则按照二分选数继续定位直到找到以base+hint为参照的区间
while (ofs < maxOfs && c.compare(key, a[base + hint - ofs]) < 0) {
lastOfs = ofs;//将找到的左顶点当作区间右顶点
ofs = (ofs << 1) + 1;//ofs按照2^m+1增加
if (ofs <= 0) // 若ofs溢出则ofs=maxOfs
ofs = maxOfs;
}
if (ofs > maxOfs)
ofs = maxOfs;
//把lastOf和ofs转换成以key为参照的区间,即左顶点为keyIndex+lastOfs,右顶点为keyIndex+ofs
int tmp = lastOfs;
lastOfs = hint - ofs;
ofs = hint - tmp;
//如果key等于右序列最大的元素,则表明所求区间从左往右更容易求得
//即base为参照求得a[keyIndex+ lastOfs] <= key < a[keyIndex+ ofs]
} else {
int maxOfs = len - hint;
while (ofs < maxOfs && c.compare(key, a[base + hint + ofs]) >= 0) {
lastOfs = ofs;
ofs = (ofs << 1) + 1;
if (ofs <= 0) // int overflow
ofs = maxOfs;
}
if (ofs > maxOfs)
ofs = maxOfs;
把lastOf和ofs转换成以key为参照的区间,即左顶点为keyIndex+lastOfs,右顶点为keyIndex+ofs
lastOfs += hint;
ofs += hint;
}
assert -1 <= lastOfs && lastOfs < ofs && ofs <= len;
//二分查找,找到key在右序列的确切位置
lastOfs++;
while (lastOfs < ofs) {
int m = lastOfs + ((ofs - lastOfs) >>> 1);
if (c.compare(key, a[base + m]) < 0)
ofs = m; // key < a[b + m]
else
lastOfs = m + 1; // a[b + m] <= key
}
assert lastOfs == ofs; // so a[b + ofs - 1] <= key < a[b + ofs]
return ofs;
}
private void mergeLo(int base1, int len1, int base2, int len2) {
assert len1 > 0 && len2 > 0 && base1 + len1 == base2;
// Copy first run into temp array
T[] a = this.a; // For performance
T[] tmp = ensureCapacity(len1);
int cursor1 = tmpBase; // Indexes into tmp array
int cursor2 = base2; // Indexes int a
int dest = base1; // Indexes int a
System.arraycopy(a, base1, tmp, cursor1, len1);
//如果右区间可交换的元素为零,则不需排序,直接将左区间的元素写回原数组即可
a[dest++] = a[cursor2++];
if (--len2 == 0) {//len2=1
System.arraycopy(tmp, cursor1, a, dest, len1);
return;
}
//原因相似于右区间,参考上面的解释
if (len1 == 1) {
System.arraycopy(a, cursor2, a, dest, len2);
a[dest + len2] = tmp[cursor1]; // Last elt of run 1 to end of merge
return;
}
Comparator<? super T> c = this.c; // Use local variable for performance
int minGallop = this.minGallop; // " " " " "
outer:
while (true) {
int count1 = 0; // Number of times in a row that first run won
int count2 = 0; // Number of times in a row that second run won
//对两个区间内的元素进行逐一排序
do {
assert len1 > 1 && len2 > 0;
if (c.compare(a[cursor2], tmp[cursor1]) < 0) {
a[dest++] = a[cursor2++];
count2++;
count1 = 0;
if (--len2 == 0)
break outer;
} else {
a[dest++] = tmp[cursor1++];
count1++;
count2 = 0;
if (--len1 == 1)
break outer;
}
} while ((count1 | count2) < minGallop);
//当run1或run2中连续出现不少于minGallop个元素都在比较中胜出,表明该这些元素构成的序列中的后续元素都将胜出,因此不应逐个比较,而是用gallop方法找出这个序列并整体交换。
do {
assert len1 > 1 && len2 > 0;
count1 = gallopRight(a[cursor2], tmp, cursor1, len1, 0, c);
if (count1 != 0) {
System.arraycopy(tmp, cursor1, a, dest, count1);
dest += count1;
cursor1 += count1;
len1 -= count1;
if (len1 <= 1) // len1 == 1 || len1 == 0
break outer;
}
a[dest++] = a[cursor2++];
if (--len2 == 0)
break outer;
count2 = gallopLeft(tmp[cursor1], a, cursor2, len2, 0, c);
if (count2 != 0) {
System.arraycopy(a, cursor2, a, dest, count2);
dest += count2;
cursor2 += count2;
len2 -= count2;
if (len2 == 0)
break outer;
}
a[dest++] = tmp[cursor1++];
if (--len1 == 1)
break outer;
minGallop--;//没出现一次整体交换,触发整体交换的minGallop阙值自减一,原因在于出现一次,那么很可能出现多次,因此要更快地检测到这些序列,减少不必要的开销。
} while (count1 >= MIN_GALLOP | count2 >= MIN_GALLOP);
if (minGallop < 0)
minGallop = 0;
minGallop += 2; // Penalize for leaving gallop mode
} // End of "outer" loop
this.minGallop = minGallop < 1 ? 1 : minGallop; // Write back to field
//特殊情况
//作为交换工作主要空间的run1如果没有元素需要交换而run2还有元素存在,那么只需将这些元素直接放置到a中即可
if (len1 == 1) {
assert len2 > 0;
System.arraycopy(a, cursor2, a, dest, len2);
a[dest + len2] = tmp[cursor1]; // Last elt of run 1 to end of merge
} else if (len1 == 0) {
//数组越界
throw new IllegalArgumentException(
"Comparison method violates its general contract!");
} else {
//如果run2没有元素需要交换而run1还有元素存在,那么只需将这些元素直接放置到a中即可
assert len2 == 0;
assert len1 > 1;
System.arraycopy(tmp, cursor1, a, dest, len1);
}
}