求个代码：1000万行长度为20~30之间的字符串，删除其中重复的行

            string reditnew = "";
                    Stopwatch watch = new Stopwatch();
                    watch.Start();                string[] alldit= File.ReadAllLines("d:\\alldit.txt");            foreach(string tmpdit in alldit)
            {
                if (!reditnew.Contains(tmpdit))
                {
                    reditnew +=tmpdit;
                }
            }
            watch.Stop();
            textview.Text += ("用时:" + watch.ElapsedMilliseconds.ToString() + "毫秒\r\n");
            File.WriteAllText("d:\\newdit.txt", reditnew);


                    Console.Beep();
这是我现在的代码一个小时了也不见结束

        private void button1_Click(object sender, EventArgs e)
        {
            string reditnew = "";
            Stopwatch watch = new Stopwatch();
            watch.Start();            string[] alldit = File.ReadAllLines("d:\\alldit.txt");            foreach (string tmpdit in alldit)
            {
                if (!reditnew.Contains(tmpdit))
                {
                    reditnew += tmpdit + "\r\n";
                }
            }
            watch.Stop();
            textview.Text += ("用时:" + watch.ElapsedMilliseconds.ToString() + "毫秒\r\n");
            File.WriteAllText("d:\\newdit.txt", reditnew);
            Console.Beep();
}
这是我现在的代码，更正二楼的漏了换行

既然已经放入内存了，Contains有点太随意了。应该首先排序，然后简单地顺序扫描一遍就行了。

很简单，这是一个基本的算法复杂度概念。如果使用contains，那么就是平方数量级的。而首先排序，然后扫描一遍，只是n*(log(n)+1)数量级的，理论上快了70万倍的速度。当然这是假设数据全都在物理内存中，不需要大量使用虚拟内存的情况下。

[email protected]
文本文件发过来

        private void button1_Click(object sender, EventArgs e)
        {
            Stopwatch watch = new Stopwatch();
            watch.Start();
            string reditnew = "";
            string[] alldit = File.ReadAllLines("d:\\alldit.txt");
            Array.Sort(alldit);            for (int i = 0; i < alldit.Length - 1; i++)
            {
                if (alldit[i].Split('$')[0] == alldit[i + 1].Split('$')[0])
                {                    alldit[i] = "";                }
                else
                {
                    if (alldit[i].Length > 1)
                    {
                        reditnew += alldit[i] + "\r\n";
                    }
                }
            }
            reditnew += alldit[alldit.Length - 1] + "\r\n";
            watch.Stop();
            textview.Text += ("用时:" + watch.ElapsedMilliseconds.ToString() + "毫秒\r\n");
            File.WriteAllText("d:\\newdit.txt", reditnew);
            Console.Beep();新代码，请帮忙改进

        private void button1_Click(object sender, EventArgs e)
        {
            Stopwatch watch = new Stopwatch();
            watch.Start();
            string reditnew = "";
            string[] alldit = File.ReadAllLines("d:\\alldit.txt");
            Array.Sort(alldit);            for (int i = 0; i < alldit.Length - 1; i++)
            {
                if (alldit[i].Split('$')[0] == alldit[i + 1].Split('$')[0])
                {                    alldit[i] = "";                }
                else
                {
                    if (alldit[i].Length > 1)
                    {
                        reditnew += alldit[i] + "\r\n";
                    }
                }
            }
            reditnew += alldit[alldit.Length - 1] + "\r\n";
            watch.Stop();
            textview.Text += ("用时:" + watch.ElapsedMilliseconds.ToString() + "毫秒\r\n");
            File.WriteAllText("d:\\newdit.txt", reditnew);
            Console.Beep();
}
这个再帮忙再改进下效率也不知道怎么再随机打乱排序

另外，你的拼接字符串相当耗时间。应该使用StringBuild对象实例来组合字符串然后再仅仅转换一次字符串，或者直接输出给结果去处理。

我给你写了个例子，随机产生10000个字符串（长度大致都不长于15个字节），你可以看看用时结果：using System;
using System.Diagnostics;
using System.Linq;
using System.Text;namespace ConsoleApplication1
{
    class Program
    {
        static void Main(string[] args)
        {
            string[] alldit = CreateDatas(100000);
            var copy = new string[alldit.Length];
            Array.Copy(alldit, copy, alldit.Length);
            test10(alldit);
            test1(copy);
            Console.ReadKey();
        }        private static void test10(string[] alldit)
        {
            Array.Sort(alldit);
            var reditnew = new StringBuilder();
            Stopwatch watch = new Stopwatch();
            watch.Start();
            string last = null;
            foreach (string tmpdit in alldit)
            {
                if (last == null || tmpdit != last)
                {
                    reditnew.AppendLine(tmpdit);
                    last = tmpdit;
                }
            }
            string result = reditnew.ToString();
            watch.Stop();
            Console.WriteLine("用时:" + watch.ElapsedMilliseconds.ToString() + "毫秒\r\n");
        }        private static string[] CreateDatas(int p)  //随机产生p和字符串
        {
            var Rnd = new Random();
            return (from n in Enumerable.Range(1, p)
                    select Rnd.Next().ToString()).ToArray();
        }        private static void test1(string[] alldit)
        {
            string reditnew = "";
            Stopwatch watch = new Stopwatch();
            watch.Start();
            foreach (string tmpdit in alldit)
            {
                if (!reditnew.Contains(tmpdit))
                {
                    reditnew += tmpdit + "\r\n";
                }
            }
            watch.Stop();
            Console.WriteLine("用时:" + watch.ElapsedMilliseconds.ToString() + "毫秒\r\n");
        }
    }
}

private void button1_Click(object sender, EventArgs e)
        {
            Stopwatch watch = new Stopwatch();
            watch.Start();
            string reditnew = "";
            StringBuilder newalldit = new StringBuilder();
            string[] alldit = File.ReadAllLines("d:\\alldit.txt");
            Array.Sort(alldit);            for (int i = 0; i < alldit.Length - 1; i++)
            {
                if (alldit[i].Split('$')[0] == alldit[i + 1].Split('$')[0])
                {                    alldit[i] = "";                }
                else
                {
                    if (alldit[i].Length > 1)
                    {
                       // reditnew += alldit[i] + "\r\n";
                        newalldit.AppendLine(alldit[i]);
                    }
                }
            }
            //reditnew += alldit[alldit.Length - 1] + "\r\n";
            newalldit.AppendLine(alldit[alldit.Length - 1]);
            watch.Stop();
            textview.Text += ("用时:" + watch.ElapsedMilliseconds.ToString() + "毫秒\r\n");
            reditnew = newalldit.ToString();
            File.WriteAllText("d:\\newdit.txt", reditnew);
            Console.Beep();
}
再次改进。性能OK了。。麻烦帮忙教我怎么打乱排序。谢谢

嗯，demo代码重用了10万个字符串，而不是1万个。其实这几乎没有做什么优化，只是用了一些常识编写出最简单的程序。

嗯。。比较菜，不同代码的性能不懂。。习惯用str+=这些比较真观的
请教我下现在怎么打乱他的排序

当然也有其它写法，比如创建Dictionary<string,int>对象，使用字符串作为键，在文件中的行数作为值，然后遍历每一个字符串并尝试插入这个字典里（这时候可以使用contains来判断是否重复），然后可以输出这个对象的Keys数组，还可以使用Values数组做到按照原来在文件中的顺序输出。

因为Dictionary<K,T>是按照hash方式来查找重复的，查找重复也会比较快。

        public static T[] RandomSort<T>(T[] array)
        {
            int len = array.Length;
            System.Collections.Generic.List<int> list = new System.Collections.Generic.List<int>();
            T[] ret = new T[len];
            Random rand = new Random();
            int i = 0;
            while (list.Count < len)
            {
                int iter = rand.Next(0, len);
                if (!list.Contains(iter))
                {
                    list.Add(iter);
                    ret[i] = array[iter];
                    i++;
                }            }
            return ret;
        }
找着了。。结帖

对于仅仅是查询重复的，显然使用hash是比排序快更多的。我修改一下测试demo：using System;
using System.Diagnostics;
using System.Linq;
using System.Text;
using System.Collections.Generic;namespace ConsoleApplication1
{
    class Program
    {
        static void Main(string[] args)
        {
            string[] alldit = CreateDatas(100000);
            var copy = new string[alldit.Length];
            Array.Copy(alldit, copy, alldit.Length);
            test10(alldit);
            test1(copy);
            Console.ReadKey();
        }        private static void test10(string[] alldit)
        {
            Array.Sort(alldit);
            var reditnew = new StringBuilder();
            Stopwatch watch = new Stopwatch();
            watch.Start();
            string last = null;
            foreach (string tmpdit in alldit)
            {
                if (last == null || tmpdit != last)
                {
                    reditnew.AppendLine(tmpdit);
                    last = tmpdit;
                }
            }
            string result = reditnew.ToString();
            watch.Stop();
            Console.WriteLine("用时:" + watch.ElapsedMilliseconds.ToString() + "毫秒\r\n");
        }        private static string[] CreateDatas(int p)  //随机产生p和字符串
        {
            var Rnd = new Random();
            return (from n in Enumerable.Range(1, p)
                    select Rnd.Next().ToString()).ToArray();
        }        private static void test1(string[] alldit)
        {
            var reditnew = new StringBuilder();
            Stopwatch watch = new Stopwatch();
            watch.Start();
            Dictionary<string, int> dict = new Dictionary<string, int>();
            foreach (string tmpdit in alldit)
            {
                if (dict.ContainsKey(tmpdit))
                {
                    reditnew.AppendLine(tmpdit);
                }
            }
            var result = reditnew.ToString();
            watch.Stop();
            Console.WriteLine("用时:" + watch.ElapsedMilliseconds.ToString() + "毫秒\r\n");
        }
    }
}
后边那个新的测试使用的是hash字典。而且它也是按照原来文件中的次序排的，无需“打乱”次序。

调试易

求个代码：1000万行长度为20~30之间的字符串，删除其中重复的行

解决方案 »