如果不是的话,你如何能在重构时,找出 100 个值中,哪个最小呢?因为开销主要在这里,尤其是如果原始数据恰好是从 1 ~ 1亿 的顺序生成的情况,意味着重构会调用近1亿次。我自己简单写了个程序测试了下,大致效果是: 生成随机数: 3946ms public static int[] genRandom(int num) { int[] ret = new int[num]; Random rand = new Random(); while (num-- > 0) ret[num] = rand.nextInt(); return ret; }啥都不干循环一次的开销: 363ms public static void iterator(int[] orgs) { for (int i = 0; i < orgs.length; i++) orgs[i] = orgs[i]; }获取Top100的开销: 514ms ... 因为大家的电脑性能不一样,所以这样就有个性能比较,大致是遍历一遍的 1.416 倍。
试过快排吗?C#随机数据在0.3~0.8s之间 CPU E5800 单线程
细节优化以后在0.1~0.42s之间,最慢的是逆序0.42s,随机数据一般在0.2s左右
1亿个数在内存中? 1亿个数就int型来说也400M了哦,楼主有想过IO问题么?
针对性优化快排以后在0.08~0.28s之间,最慢的是逆序0.28s,随机数据基本在0.08s左右
求的最小,最大需要把逻辑处理反过来 private unsafe struct rangeSorter { public int* skipCount; public int* getEndIndex; public void sort(int* startIndex, int* endIndex) { do { int leftValue = *startIndex, rightValue = *endIndex, averageIndex = (int)(endIndex - startIndex) >> 1; if (averageIndex == 0) { if (leftValue > rightValue) { *startIndex = rightValue; *endIndex = leftValue; } break; } int* average = startIndex + averageIndex, leftIndex = startIndex, rightIndex = endIndex; int value = *average; if (leftValue <= value) { if (value > rightValue) { *rightIndex = value; if (leftValue <= rightValue) *average = value = rightValue; else { *leftIndex = rightValue; *average = value = leftValue; } } } else if (leftValue <= rightValue) { *leftIndex = value; *average = value = leftValue; } else { *rightIndex = leftValue; if (value <= rightValue) { *leftIndex = value; *average = value = rightValue; } else *leftIndex = rightValue; } ++leftIndex; --rightIndex; do { while (*leftIndex < value) ++leftIndex; while (value < *rightIndex) --rightIndex; if (leftIndex < rightIndex) { leftValue = *leftIndex; *leftIndex = *rightIndex; *rightIndex = leftValue; } else { if (leftIndex == rightIndex) { ++leftIndex; --rightIndex; } break; } } while (++leftIndex <= --rightIndex); if (rightIndex - startIndex <= endIndex - leftIndex) { if (startIndex < rightIndex && rightIndex >= skipCount) sort(startIndex, rightIndex); if (leftIndex > getEndIndex) break; startIndex = leftIndex; } else { if (leftIndex < endIndex && leftIndex <= getEndIndex) sort(leftIndex, endIndex); if (rightIndex < skipCount) break; endIndex = rightIndex; } } while (startIndex < endIndex); } } private static unsafe int[] topLess(int[] values, int count) { uint sqrtMod; int length = Math.Min(Math.Max(count << 2, count + (int)showjim.sys.number.sqrt((uint)values.Length, out sqrtMod)), values.Length); int[] value = new int[length]; fixed (int* writeFixed = value, valueFixed = values) { int* writeStart = writeFixed + count, writeMax = writeStart, write = writeStart, writeEnd = writeFixed + length - 1; rangeSorter sort = new rangeSorter { skipCount = --writeMax, getEndIndex = writeStart }; showjim.sys.memory.copyUnsafe(valueFixed, writeFixed, length << 2); sort.sort(writeFixed, writeEnd); int maxValue = *writeMax; for (int* start = valueFixed + length, end = valueFixed + values.Length; start != end; ++start) { if (*start < maxValue) { *write = *start; if (write == writeEnd) { sort.sort(writeFixed, writeEnd); write = writeStart; maxValue = *writeMax; } else ++write; } } if (write != writeStart) sort.sort(writeFixed, write - 1); } return value.sub(0, count).toArrayUnsafe(); }
我的机子看来不行啊。初次运行4949ms,关了一些程序后在4820-4860ms左右。
无聊写了个算法 我机器上跑一次差不多0.37秒 LZ的要4秒多 大家看看有没有什么问题。import java.util.Arrays; import java.util.Random;public class MyTop100 { static final int TOTAL = 10000000;//1亿个数 static final int MAX = Integer.MAX_VALUE;//随机数最大值 static final int SIZE = 100;//前100个最大数 static final int[] TOPS = new int[SIZE];//存放最大值的数组
public static void main(String[] args) { long start = System.nanoTime();//开始 find(TOTAL, SIZE);//获取最大数 System.out.println(Arrays.toString(TOPS));//打印 long end = System.nanoTime();//结束 System.out.println("time = " + (end - start) / 1e9 + " seconds");//输出时间 }
static void find(int total,int size){ int num; Random ran = new Random(); //前100个数直接存入数组 for(int i = 0;i < size;++i){ num = ran.nextInt(MAX); TOPS[i] = num; } setMin(TOPS); for(int i = size;i < total;++i){ //如果当前数比arr[0]大,将arr[0]替换为num,再获取最小值放入arr[0] num = ran.nextInt(MAX); if(num > TOPS[0]){ TOPS[0] = num; setMin(TOPS); } }
}
//获取当前最小值,放入arr[0] static void setMin(int[] arr){ int min = arr[0]; int index = 0; for(int i = 1;i < arr.length;i++){ if(arr[i] < min){ min = arr[i]; index = i; } } int temp = arr[0]; arr[0] = arr[index]; arr[index] = temp; } }
改写了下楼主的代码,改成javascript版本,可在node.js上执行 基本原封不动,可以看到连注释都保留了,我直接把楼主的 .java文件改成了.js,改了改函数变量声明和几个小的地方,算法完全相同执行结果: 本人笔记本(渣,core2 2.2GHz,2G内存,132个进程 ) node中:2700ms 到 2900ms之间 chrome浏览器:2100ms 到 2500ms(我也很诧异,chrome里比node里还要快,看来node做的还差很多啊) function main() { find(); } function find() {// var number = 100000000;// 一亿个数 var maxnum = 1000000000;// 随机数最大值 var i = 0; var topnum = 100;// 取最大的多少个
var startTime = Date.now(); var top = new Array(topnum); for (i = 0; i < topnum; i++) { top[i] =(Math.random()*maxnum)>>0;//设置为随机数 } buildHeap(top, 0, top.length);// 构建最小堆, top[0]为最小元素 for (i = topnum; i < number; i++) { var currentNumber2 =(Math.random()*maxnum)>>0;//设置为随机数 // 大于 top[0]则交换currentNumber2 重构最小堆 if (top[0] < currentNumber2) { top[0] = currentNumber2; shift(top, 0, top.length, 0); // 构建最小堆 top[0]为最小元素 } } console.log(top.join(",")); sort(top); console.log(top.join(","));
var endTime = Date.now(); console.log("Time is:",endTime-startTime," ms.")
} //构造排序数组 function buildHeap(array,from,len) { var pos = (len - 1) / 2; for (var i = pos; i >= 0; i--) { shift(array, from, len, i); } }
/** * @param array top数组 * @param from 开始 * @param len 数组长度 * @param pos 当前节点index * */ function shift(array,from,len,pos) { // 保存该节点的值 var tmp = array[from + pos]; var index = pos * 2 + 1;// 得到当前pos节点的左节点 while (index < len)// 存在左节点 { if (index + 1 < len && array[from + index] > array[from + index + 1])// 如果存在右节点 { // 如果右边节点比左边节点小,就和右边的比较 index += 1; }
我觉得他的重构就是将100个元素重新排列一下 将最小的还放到ary[0] 不知对否。。
“然后重构将数组最小值放在ary[0]”
这个过程是否要遍历这个数组,也就是完整循环一次?
如果不是的话,你如何能在重构时,找出 100 个值中,哪个最小呢?因为开销主要在这里,尤其是如果原始数据恰好是从 1 ~ 1亿 的顺序生成的情况,意味着重构会调用近1亿次。我自己简单写了个程序测试了下,大致效果是:
生成随机数: 3946ms
public static int[] genRandom(int num) {
int[] ret = new int[num];
Random rand = new Random();
while (num-- > 0) ret[num] = rand.nextInt();
return ret;
}啥都不干循环一次的开销: 363ms
public static void iterator(int[] orgs) {
for (int i = 0; i < orgs.length; i++) orgs[i] = orgs[i];
}获取Top100的开销: 514ms
...
因为大家的电脑性能不一样,所以这样就有个性能比较,大致是遍历一遍的 1.416 倍。
CPU E5800 单线程
1亿个数就int型来说也400M了哦,楼主有想过IO问题么?
private unsafe struct rangeSorter
{
public int* skipCount;
public int* getEndIndex;
public void sort(int* startIndex, int* endIndex)
{
do
{
int leftValue = *startIndex, rightValue = *endIndex, averageIndex = (int)(endIndex - startIndex) >> 1;
if (averageIndex == 0)
{
if (leftValue > rightValue)
{
*startIndex = rightValue;
*endIndex = leftValue;
}
break;
}
int* average = startIndex + averageIndex, leftIndex = startIndex, rightIndex = endIndex;
int value = *average;
if (leftValue <= value)
{
if (value > rightValue)
{
*rightIndex = value;
if (leftValue <= rightValue) *average = value = rightValue;
else
{
*leftIndex = rightValue;
*average = value = leftValue;
}
}
}
else if (leftValue <= rightValue)
{
*leftIndex = value;
*average = value = leftValue;
}
else
{
*rightIndex = leftValue;
if (value <= rightValue)
{
*leftIndex = value;
*average = value = rightValue;
}
else *leftIndex = rightValue;
}
++leftIndex;
--rightIndex;
do
{
while (*leftIndex < value) ++leftIndex;
while (value < *rightIndex) --rightIndex;
if (leftIndex < rightIndex)
{
leftValue = *leftIndex;
*leftIndex = *rightIndex;
*rightIndex = leftValue;
}
else
{
if (leftIndex == rightIndex)
{
++leftIndex;
--rightIndex;
}
break;
}
}
while (++leftIndex <= --rightIndex);
if (rightIndex - startIndex <= endIndex - leftIndex)
{
if (startIndex < rightIndex && rightIndex >= skipCount) sort(startIndex, rightIndex);
if (leftIndex > getEndIndex) break;
startIndex = leftIndex;
}
else
{
if (leftIndex < endIndex && leftIndex <= getEndIndex) sort(leftIndex, endIndex);
if (rightIndex < skipCount) break;
endIndex = rightIndex;
}
}
while (startIndex < endIndex);
}
}
private static unsafe int[] topLess(int[] values, int count)
{
uint sqrtMod;
int length = Math.Min(Math.Max(count << 2, count + (int)showjim.sys.number.sqrt((uint)values.Length, out sqrtMod)), values.Length);
int[] value = new int[length];
fixed (int* writeFixed = value, valueFixed = values)
{
int* writeStart = writeFixed + count, writeMax = writeStart, write = writeStart, writeEnd = writeFixed + length - 1;
rangeSorter sort = new rangeSorter { skipCount = --writeMax, getEndIndex = writeStart };
showjim.sys.memory.copyUnsafe(valueFixed, writeFixed, length << 2);
sort.sort(writeFixed, writeEnd);
int maxValue = *writeMax;
for (int* start = valueFixed + length, end = valueFixed + values.Length; start != end; ++start)
{
if (*start < maxValue)
{
*write = *start;
if (write == writeEnd)
{
sort.sort(writeFixed, writeEnd);
write = writeStart;
maxValue = *writeMax;
}
else ++write;
}
}
if (write != writeStart) sort.sort(writeFixed, write - 1);
}
return value.sub(0, count).toArrayUnsafe();
}
我机器上跑一次差不多0.37秒
LZ的要4秒多
大家看看有没有什么问题。import java.util.Arrays;
import java.util.Random;public class MyTop100 {
static final int TOTAL = 10000000;//1亿个数
static final int MAX = Integer.MAX_VALUE;//随机数最大值
static final int SIZE = 100;//前100个最大数
static final int[] TOPS = new int[SIZE];//存放最大值的数组
public static void main(String[] args) {
long start = System.nanoTime();//开始
find(TOTAL, SIZE);//获取最大数
System.out.println(Arrays.toString(TOPS));//打印
long end = System.nanoTime();//结束
System.out.println("time = " + (end - start) / 1e9 + " seconds");//输出时间
}
static void find(int total,int size){
int num;
Random ran = new Random();
//前100个数直接存入数组
for(int i = 0;i < size;++i){
num = ran.nextInt(MAX);
TOPS[i] = num;
}
setMin(TOPS);
for(int i = size;i < total;++i){
//如果当前数比arr[0]大,将arr[0]替换为num,再获取最小值放入arr[0]
num = ran.nextInt(MAX);
if(num > TOPS[0]){
TOPS[0] = num;
setMin(TOPS);
}
}
}
//获取当前最小值,放入arr[0]
static void setMin(int[] arr){
int min = arr[0];
int index = 0;
for(int i = 1;i < arr.length;i++){
if(arr[i] < min){
min = arr[i];
index = i;
}
}
int temp = arr[0];
arr[0] = arr[index];
arr[index] = temp;
}
}
我的机器在 200ms~4000ms之间,还算比较理想。
我的机器在 200ms~4000ms之间,还算比较理想。
基本原封不动,可以看到连注释都保留了,我直接把楼主的 .java文件改成了.js,改了改函数变量声明和几个小的地方,算法完全相同执行结果:
本人笔记本(渣,core2 2.2GHz,2G内存,132个进程 )
node中:2700ms 到 2900ms之间
chrome浏览器:2100ms 到 2500ms(我也很诧异,chrome里比node里还要快,看来node做的还差很多啊)
function main() {
find();
}
function find() {//
var number = 100000000;// 一亿个数
var maxnum = 1000000000;// 随机数最大值
var i = 0;
var topnum = 100;// 取最大的多少个
var startTime = Date.now();
var top = new Array(topnum);
for (i = 0; i < topnum; i++) {
top[i] =(Math.random()*maxnum)>>0;//设置为随机数
}
buildHeap(top, 0, top.length);// 构建最小堆, top[0]为最小元素
for (i = topnum; i < number; i++) {
var currentNumber2 =(Math.random()*maxnum)>>0;//设置为随机数
// 大于 top[0]则交换currentNumber2 重构最小堆
if (top[0] < currentNumber2) {
top[0] = currentNumber2;
shift(top, 0, top.length, 0); // 构建最小堆 top[0]为最小元素
}
}
console.log(top.join(","));
sort(top);
console.log(top.join(","));
var endTime = Date.now();
console.log("Time is:",endTime-startTime," ms.")
}
//构造排序数组
function buildHeap(array,from,len) {
var pos = (len - 1) / 2;
for (var i = pos; i >= 0; i--) {
shift(array, from, len, i);
}
}
/**
* @param array top数组
* @param from 开始
* @param len 数组长度
* @param pos 当前节点index
* */
function shift(array,from,len,pos) {
// 保存该节点的值
var tmp = array[from + pos];
var index = pos * 2 + 1;// 得到当前pos节点的左节点
while (index < len)// 存在左节点
{
if (index + 1 < len && array[from + index] > array[from + index + 1])// 如果存在右节点
{
// 如果右边节点比左边节点小,就和右边的比较
index += 1;
}
if (tmp > array[from + index]) {
array[from + pos] = array[from + index];
pos = index;
index = pos * 2 + 1;
} else {
break;
}
}
// 最终全部置换完毕后 ,把临时变量赋给最后的节点
array[from + pos] = tmp;
}
function sort(array){
for(var i = 0; i < array.length - 1; i++){
//当前值当作最小值
var min = array[i];
for(var j = i+1; j < array.length; j++){
if(min>array[j]){
//如果后面有比min值还小的就交换
min = array[j];
array[j] = array[i];
array[i] = min;
}
}
}
}
main();
都是C++标准库里面的,Java库里面应该也有对应的。
这不是最好算法。
要是采取算法导论第9章那个select算法,(我这写了个实现代码 http://blog.csdn.net/michealtx/article/details/7467485 )寻找第i大树时间复杂度最坏为O(m),然后运行100次或者1000次,则总的时间复杂度为O(n*m)。
欢迎拍砖指正!