关于千万笔String处理效能

al9 是一个 ArrayList，存有一千多万笔 String 数据，对它进行如下过滤工作会跑特别久。
怎么修改以下过滤工作可以加快处理？（任何可以加快的方法都可以）
String filter="";
// Original poster's filter. For every record whose "field2$field3;" key is not yet
// recorded in `filter`, the whole list is rescanned and every record with the same
// fields 1, 2 and 3 is copied into bomal2. The cost therefore grows with
// (list size) x (number of distinct keys).
// NOTE(review): `it` is never used below.
Iterator<String> it = al9.iterator();
          Iterator itt = al9.iterator();
                while(itt.hasNext()){
                    String bom = (String)itt.next();
                    // Records are '$'-separated; "\\$" escapes the regex metacharacter.
                    String[] boma = bom.split("\\$");
                    // Nothing recorded yet (first record): always scan.
                    if(filter.equals("")){
                      Iterator it2 = al9.iterator();
                      while(it2.hasNext()){
                          String bom2 = (String)it2.next();
                          String[] bom2a = bom2.split("\\$");
                          if((bom2a[2].equals(boma[2])) && (bom2a[1].equals(boma[1])) && (bom2a[3].equals(boma[3]))){
                              bomal2.add(bom2);

                          }
                          //i01++;
                      }
                      // Only this first entry is prefixed with field 0; later entries are not.
                      filter=filter+boma[0]+"$"+boma[2]+"$"+boma[3]+";";
                    // Key already recorded: skip the record.
                    // NOTE(review): contains() is a substring test, so "2$13;" is also
                    // found inside "12$13;".
                    }else if(filter.contains(boma[2]+"$"+boma[3]+";")){

                    }else{
                        // New key: same scan as in the first branch.
                        Iterator it2 = al9.iterator();
                      while(it2.hasNext()){
                          String bom2 = (String)it2.next();
                          String[] bom2a = bom2.split("\\$");
                          if((bom2a[2].equals(boma[2])) && (bom2a[1].equals(boma[1])) && (bom2a[3].equals(boma[3]))){
                              bomal2.add(bom2);

                          }
                          //i02++;
                      }
                      filter=filter+boma[2]+"$"+boma[3]+";";
                    }

                }

解决方案 »

免费领取超大流量手机卡，每月29元包185G流量+100分钟通话, 中国电信官方发货

千万级记录数，任何方法都会跑挺久，跟Spring没啥特定关系吧。考虑代码改为分布式计算，用N个线程甚至直接来N台电脑，每个负责执行1/N的数据处理任务。具体可以参考Map/Reduce的实现。
建议
①是否可以用数据库端做这些事情
②如果在数据进入的时候，规范字符串格式，则只需要直接比较bom和boma两个字符串是否相等即可，无需split后逐一比较分量了
③是否可以在hashCode上下点功夫，提高算法速度
给做个格式化，还加了al9, bomal2两个list：
ArrayList<String> al9 = new ArrayList<String>();
// Reformatted copy of the original filter: bomal2 receives, for each new
// "field2$field3;" key, every record of al9 whose fields 1, 2 and 3 equal those of
// the record that introduced the key.
ArrayList<String> bomal2 = new ArrayList<String>();
// Concatenation of all keys handled so far, each terminated by ';'.
String filter = "";
// NOTE(review): `it` is never used below.
Iterator<String> it = al9.iterator();
Iterator itt = al9.iterator();
while (itt.hasNext())
{
String bom = (String) itt.next();
String[] boma = bom.split("\\$");
// First record: nothing recorded yet, so always scan.
if (filter.equals(""))
{
Iterator it2 = al9.iterator();
while (it2.hasNext())
{
String bom2 = (String) it2.next();
String[] bom2a = bom2.split("\\$");
if ((bom2a[2].equals(boma[2])) && (bom2a[1].equals(boma[1])) && (bom2a[3].equals(boma[3])))
{
bomal2.add(bom2); }
// i01++;
}
// Only this first entry carries the field-0 prefix.
filter = filter + boma[0] + "$" + boma[2] + "$" + boma[3] + ";";
}
// Key already recorded (substring test): nothing to do.
else if (filter.contains(boma[2] + "$" + boma[3] + ";"))
{ }
else
{
// New key: same scan as in the first branch.
Iterator it2 = al9.iterator();
while (it2.hasNext())
{
String bom2 = (String) it2.next();
String[] bom2a = bom2.split("\\$");
if ((bom2a[2].equals(boma[2])) && (bom2a[1].equals(boma[1])) && (bom2a[3].equals(boma[3])))
{
bomal2.add(bom2); }
// i02++;
}
filter = filter + boma[2] + "$" + boma[3] + ";";
}
}
if (filter.equals(""))
{
}
else if (filter.contains(boma[2] + "$" + boma[3] + ";"))
{
}
else
{
}
这段把第一个IF删除，直接写成：
if (filter.contains(boma[2] + "$" + boma[3] + ";"))
{
}
else
{
}
这样逻辑的复杂性降低了一点，代码更好懂。
我已经把程序简化成这样了: public static void main(String[] args)
{
// Simplified form of the original filter: one branch instead of three.
// `list` is created empty here, so the loops below do no work as written.
ArrayList<String> list = new ArrayList<String>();
ArrayList<String> bomal2 = new ArrayList<String>();
// Concatenation of every key handled so far, each terminated by ';'.
String filter = "";
for (String bom : list)
{
String[] bomArray = bom.split("\\$");
// Key is built from fields 2 and 3 only; field 1 is compared in the inner loop.
String key = bomArray[2] + "$" + bomArray[3] + ";";
// NOTE(review): substring test, so key "2$13;" is also found inside "12$13;".
if (filter.contains(key))
{
}
else
{
// New key: rescan the whole list for records with equal fields 1, 2 and 3.
for (String bom2 : list)
{
String[] bom2Array = bom2.split("\\$");
if ((bom2Array[1].equals(bomArray[1])) && (bom2Array[2].equals(bomArray[2])) && (bom2Array[3].equals(bomArray[3])))
{
bomal2.add(bom2);
}
}
filter = filter + key;
}
}
}
性能肯定没有提升，但代码更精炼，更易读了。
楼主的程序逻辑是不是有问题:楼主程序是根据后三项判重复，有重复的则加入另一LIST。但没排除元素本身与自己相等的情况。
这里有个错误:
if (filter.contains(key))
filter里的值是 12$13;14$15;...
这样以后有 2$13;的也会 contains() 为 true, 这不知道是需求还是BUG
是BUG的话可以使filter初始为";", 比较时这样:
if (filter.contains(";" + key))
在key前加个分号。当然，楼主这样的做法必须保证值是不会有分号。
我觉得contains还是用hashSet快些
以下是我的测试程序，test4比test性能大概提升了15倍。
public class Test
{
/**
 * Benchmark driver: builds 10000 random four-field records of the form
 * "$a$b$c" and times the four filter variants on the same list, printing the
 * elapsed milliseconds, the result size and the result list for each.
 */
public static void main(String[] args)
{
    final ArrayList<String> list = new ArrayList<String>(10000);
    ArrayList<String> bomal2 = new ArrayList<String>(10000);
    Random r = new Random();
    for (int i = 0; i < 10000; i++)
    {
        String v = "$" + r.nextInt(100) + "$" + r.nextInt(100) + "$" + r.nextInt(100);
        list.add(v);
    }

    System.out.println("start ... ");
    long start, end;

    start = System.currentTimeMillis();
    test(list, bomal2);
    end = System.currentTimeMillis();
    System.out.println("\ntest spend: " + (end - start));
    System.out.println("size: " + bomal2.size());
    System.out.println(bomal2.toString());

    bomal2.clear();
    start = System.currentTimeMillis();
    test2(list, bomal2);
    end = System.currentTimeMillis();
    System.out.println("\ntest2 spend: " + (end - start));
    System.out.println("size: " + bomal2.size());
    System.out.println(bomal2.toString());

    bomal2.clear();
    start = System.currentTimeMillis();
    test3(list, bomal2);
    end = System.currentTimeMillis();
    System.out.println("\ntest3 spend: " + (end - start));
    System.out.println("size: " + bomal2.size());
    System.out.println(bomal2.toString());

    bomal2.clear();
    start = System.currentTimeMillis();
    test4(list, bomal2);
    end = System.currentTimeMillis();
    System.out.println("\ntest4 spend: " + (end - start));
    System.out.println("size: " + bomal2.size());
    System.out.println(bomal2.toString());
}

/**
 * Baseline: remembers handled "field2$field3" keys in one ';'-delimited string.
 * For each new key, every record whose fields 1, 2 and 3 equal those of the
 * record that introduced the key is appended to {@code bomal2}.
 *
 * @param list   '$'-separated records with at least four fields
 * @param bomal2 receives the matching records
 */
public static void test(final ArrayList<String> list, ArrayList<String> bomal2)
{
    // Leading ';' plus the ';' after every key lets ";key;" match whole keys only.
    String filter = ";";
    for (String bom : list)
    {
        String[] bomArray = bom.split("\\$");
        String key = bomArray[2] + "$" + bomArray[3] + ";";
        if (!filter.contains(";" + key))
        {
            for (String bom2 : list)
            {
                String[] bom2Array = bom2.split("\\$");
                if ((bom2Array[1].equals(bomArray[1])) && (bom2Array[2].equals(bomArray[2]))
                        && (bom2Array[3].equals(bomArray[3])))
                {
                    bomal2.add(bom2);
                }
            }
            filter = filter + key;
        }
    }
    System.out.println(filter);
}

/**
 * Same algorithm as {@link #test}, with the handled keys kept in a HashSet.
 *
 * @param list   '$'-separated records with at least four fields
 * @param bomal2 receives the matching records
 */
public static void test2(final ArrayList<String> list, ArrayList<String> bomal2)
{
    Set<String> filter = new HashSet<String>(list.size() / 2);
    for (String bom : list)
    {
        String[] bomArray = bom.split("\\$");
        String key = bomArray[2] + "$" + bomArray[3];
        if (!filter.contains(key))
        {
            for (String bom2 : list)
            {
                String[] bom2Array = bom2.split("\\$");
                if ((bom2Array[1].equals(bomArray[1])) && (bom2Array[2].equals(bomArray[2]))
                        && (bom2Array[3].equals(bomArray[3])))
                {
                    bomal2.add(bom2);
                }
            }
            filter.add(key);
        }
    }
    System.out.println(filter);
}

/**
 * Same result as {@link #test2} without regex splitting in the inner loop.
 * Only fields 1-3 are compared, so records may carry any number of further
 * fields (the earlier endsWith-based comparison was only correct for records
 * with exactly four fields).
 *
 * @param list   '$'-separated records with at least four fields
 * @param bomal2 receives the matching records
 * @throws IllegalArgumentException if a record has fewer than three '$'
 */
public static void test3(final ArrayList<String> list, ArrayList<String> bomal2)
{
    Set<String> filter = new HashSet<String>(list.size() / 2);
    for (String bom : list)
    {
        String triple = fieldsOneToThree(bom);
        // Dropping field 1 from "f1$f2$f3" leaves the "f2$f3" filter key.
        String key = triple.substring(triple.indexOf('$') + 1);
        if (!filter.contains(key))
        {
            for (String bom2 : list)
            {
                if (hasFieldsOneToThree(bom2, triple))
                {
                    bomal2.add(bom2);
                }
            }
            filter.add(key);
        }
    }
    System.out.println(filter);
}

/**
 * Like {@link #test3}, but the inner scan starts at the current record.
 * That is safe because a record is only scanned for when its key is new, and
 * any earlier record with the same fields 1-3 would already have recorded
 * that key.
 *
 * @param list   '$'-separated records with at least four fields
 * @param bomal2 receives the matching records
 * @throws IllegalArgumentException if a record has fewer than three '$'
 */
public static void test4(final ArrayList<String> list, ArrayList<String> bomal2)
{
    Set<String> filter = new HashSet<String>(list.size() / 2);
    for (int i = 0; i < list.size(); i++)
    {
        String triple = fieldsOneToThree(list.get(i));
        String key = triple.substring(triple.indexOf('$') + 1);
        if (!filter.contains(key))
        {
            for (int j = i; j < list.size(); j++)
            {
                String bom2 = list.get(j);
                if (hasFieldsOneToThree(bom2, triple))
                {
                    bomal2.add(bom2);
                }
            }
            filter.add(key);
        }
    }
    System.out.println(filter);
}

/** Returns "field1$field2$field3" of a record, i.e. the text between its 1st and 4th '$'. */
private static String fieldsOneToThree(String bom)
{
    int first = bom.indexOf('$');
    int second = (first < 0) ? -1 : bom.indexOf('$', first + 1);
    int third = (second < 0) ? -1 : bom.indexOf('$', second + 1);
    if (third < 0)
    {
        throw new IllegalArgumentException("expected at least four '$'-separated fields: " + bom);
    }
    int fourth = bom.indexOf('$', third + 1);
    return bom.substring(first + 1, (fourth < 0) ? bom.length() : fourth);
}

/** Tells whether fields 1-3 of {@code record} equal {@code triple}, without allocating. */
private static boolean hasFieldsOneToThree(String record, String triple)
{
    int first = record.indexOf('$');
    if (first < 0)
    {
        return false;
    }
    int end = first + 1 + triple.length();
    // Field 3 must stop exactly here: at the end of the record or at the next '$'.
    return record.startsWith(triple, first + 1)
            && (end == record.length() || record.charAt(end) == '$');
}
}
以下是我的测试结果：
start ...
test spend: 10842
size: 6350
test2 spend: 10187
size: 6350
test3 spend: 1321
size: 6350
test4 spend: 672
size: 6350
// Keys ("field2$field3") whose matching records have already been collected.
HashSet<String> filter = new HashSet<String>();
for(String record : al9){
    String[] fields = record.split("\\$");
    String key = fields[2] + "$" + fields[3];
    if(filter.contains(key)){
        // This key was handled when it was first seen.
        continue;
    }
    // New key: collect every record sharing fields 1, 2 and 3 with this one.
    for(String candidate : al9){
        String[] other = candidate.split("\\$");
        boolean sameSecond = other[2].equals(fields[2]);
        if(sameSecond && other[1].equals(fields[1]) && other[3].equals(fields[3])){
            bomal2.add(candidate);
        }
    }
    filter.add(key);
}
/**
 * Entry point of the simplified filter. The input list is created empty here,
 * exactly as in the posted snippet; the filtering itself lives in
 * {@link #collectFirstSeenGroups}.
 */
public static void main(String[] args)
    {
        ArrayList<String> list = new ArrayList<String>();
        ArrayList<String> bomal2 = new ArrayList<String>();
        collectFirstSeenGroups(list, bomal2);
    }

    /**
     * Single-pass replacement for the nested-loop filter.
     *
     * For each distinct (field2, field3) pair, in order of first appearance,
     * appends to {@code out} every record whose fields 1, 2 and 3 equal those
     * of the first record carrying that pair, in list order. This is the
     * output of the nested loops, minus their defect of skipping a pair whose
     * "f2$f3;" text merely occurs inside a longer recorded key (for example
     * "2$13;" inside "12$13;").
     *
     * @param records '$'-separated records with at least four fields
     * @param out     receives the selected records
     * @throws ArrayIndexOutOfBoundsException if a record has fewer than four fields
     */
    static void collectFirstSeenGroups(ArrayList<String> records, ArrayList<String> out)
    {
        // Fully qualified because the snippet's import list is not shown.
        // pair key "f2$f3" -> field 1 of the first record carrying that pair
        java.util.HashMap<String, String> firstField1ByPair = new java.util.HashMap<String, String>();
        // pair key -> selected records; insertion order is first-seen order of the pairs
        java.util.LinkedHashMap<String, ArrayList<String>> groupByPair =
                new java.util.LinkedHashMap<String, ArrayList<String>>();

        for (String record : records)
        {
            String[] fields = record.split("\\$");
            // Fields cannot contain '$', so the joined key is unambiguous.
            String pairKey = fields[2] + "$" + fields[3];
            ArrayList<String> group = groupByPair.get(pairKey);
            if (group == null)
            {
                group = new ArrayList<String>();
                groupByPair.put(pairKey, group);
                firstField1ByPair.put(pairKey, fields[1]);
                group.add(record);
            }
            else if (fields[1].equals(firstField1ByPair.get(pairKey)))
            {
                group.add(record);
            }
        }

        for (ArrayList<String> group : groupByPair.values())
        {
            out.addAll(group);
        }
    }
也就是在一个千万笔数据中找出 boma[1],boma[2],boma[3]相同的留下其他都删除
看你这几天连续发贴，都是问此类相关问题，应该是工作上的问题吧。客观地说，自己开发这种代码投资收益比太低，因为涉及较多环节的工作量，如果你不是为了锻炼自己能力的话，建议：
1、将数据导入Oracle中，千万级数据，导入并不会太久，应该不会超过1个小时。
2、用Select ... Group By ... Having 一句话得到你要的结果集，千万级记录对Oracle来说不算太大，预先建好索引，然后打开并发计算参数。
感谢你的建议但对把逻辑转成sql不太熟悉
语句可能比你想象的要简单多，你可以用数据库弄个7、8行简单模拟下试试看。按照你的要求：“也就是在一个千万笔数据中找出 boma[1],boma[2],boma[3]相同的留下其他都删除”。
假设将这三个数，存入数据库的字段名分别为： fa, fb, fc，表名为 tab
你需要的SQL如下：
Select fa, fb, fc, count(*) As num     /* num 是重复出现的次数*/
From tab
Group By fa, fb, fc  /* 分组 */
Having count(*) > 1  /* 出现超过1次的才算 */
That's all. 当然，如果你确实是为了锻炼自己的设计开发能力那是另当别论了。
说真的，这个问题是个好问题；
但能力有限，对算法和数据结构知识匮乏；
对Java-正则表达式知识也很肤浅，出于学习Java-正则表达式的目的；改写了下；
测试结果：速度快了一点，不知道内存怎么样(性能不知道是否考虑空间)；
static String charReguExp = "(([a-zA-Z]*\\$)([a-zA-Z]*\\$([a-zA-Z]*\\$[a-zA-Z]*)\\$)[a-zA-Z]*)";

// Compiled once from charReguExp and shared by every getString() call.
static Pattern pattern = Pattern.compile(charReguExp);
// Scratch buffer that getString() clears and refills on each call.
// NOTE(review): shared mutable static state; StringBuffer is synchronized per call only.
static StringBuffer sb = new StringBuffer("");

// Small hand-written sample, kept for reference:
// static List<String> al9 = Arrays.asList("aaa$bbb$ccc$ddd$",
// "bbb$hhh$ccc$ddd$ttt",
// "ttt$jj$nnn$ddd$ooo",
// "bbb$hhh$cc$ddd$ttt");
// Benchmark input, filled by the static initializer below.
static List<String> al9 = new ArrayList<String>();
static{
// 5,000,000 outer x 4 inner iterations x 4 adds = 80,000,000 entries,
// all references to the same four string literals.
for(int i = 1; i <= 5000000; i++){
for(int j = 1; j <= 4; j++){
al9.add("aaa$bbb$ccc$ddd$");
al9.add("bbb$hhh$ccc$ddd$ttt");
al9.add("ttt$jj$nnn$ddd$ooo");
al9.add("bbb$hhh$cc$ddd$ttt");
}
}
}

// Runs the regex-based variant and then the original nested-loop variant on the same
// al9 list, printing a separator line between their outputs.
public static void main(String[] args) {
prossessByRegEx(al9);
System.out.println("============");
process(al9);
}

/**
 * Regex-based variant of process(): for every record whose "field2$field3;"
 * key is not yet present in the filter text, collects all records with the
 * same fields 1-3 (via addValueForList) and records the key. Prints the
 * elapsed milliseconds, the number of collected records and the filter text.
 *
 * An empty list is now handled (it used to fail on al9.get(0)), the filter is
 * searched in place instead of being copied to a String on every iteration,
 * and each record's key is extracted once instead of twice.
 *
 * @param al9 the '$'-separated records to filter
 */
public static void prossessByRegEx(List<String> al9){
    long start = System.currentTimeMillis();

    ArrayList<String> bomal2 = new ArrayList<String>();
    StringBuffer filter = new StringBuffer("");

    for(int i = 0; i < al9.size(); i++){
        String record = al9.get(i);
        if(i == 0){
            // The first record is always processed; as in process(), its filter
            // entry is prefixed with field 0 (group 2) before the key (group 4).
            addValueForList(record, al9, bomal2);
            filter.append(getString(record, 2, 4));
            continue;
        }
        String key = getString(record, 4);
        // Substring search, same semantics as String.contains on the filter text.
        if(filter.indexOf(key) < 0){
            addValueForList(record, al9, bomal2);
            filter.append(key);
        }
    }
    long end = System.currentTimeMillis();

    System.out.println(end - start);
    System.out.println(bomal2.size());
    // for(String str : bomal2){
    // System.out.println(str);
    // }
    System.out.println(filter.toString());
}

/**
 * Concatenates the requested capture groups of the first match of the shared
 * pattern in {@code strValue} and appends ';'.
 *
 * Per charReguExp, group 2 is "field0$", group 3 is "field1$field2$field3$"
 * and group 4 is "field2$field3".
 *
 * @param strValue the record to match
 * @param groups   capture-group numbers to concatenate, in order
 * @return the concatenated groups followed by ';', or "" when nothing matched
 */
private static String getString(String strValue, int... groups){
    // Reuse the shared scratch buffer; the returned String is an independent copy.
    sb.setLength(0);
    Matcher regexMatcher = pattern.matcher(strValue);
    if(regexMatcher.find()){
        for(int group : groups){
            sb.append(regexMatcher.group(group));
        }
    }
    if(sb.length() > 0){
        sb.append(";");
    }
    return sb.toString();
}

/**
 * Tells whether the "field2$field3;" key of {@code strValue} (capture group 4)
 * occurs anywhere in the filter text.
 */
private static boolean contains(String filter, String strValue){
    String key = getString(strValue, 4);
    return filter.indexOf(key) >= 0;
}

/**
 * Appends to {@code bomal2} every element of {@code al9} whose capture group 3
 * ("field1$field2$field3$") equals that of {@code strValue}.
 */
private static void addValueForList(String strValue,List<String> al9, List<String> bomal2){
    final String wanted = getString(strValue, 3);
    for(Iterator<String> cursor = al9.iterator(); cursor.hasNext();){
        String candidate = cursor.next();
        if(!wanted.equals(getString(candidate, 3))){
            continue;
        }
        bomal2.add(candidate);
    }
}

/**
 * Nested-loop baseline. For every record whose "field2$field3;" text is not
 * yet contained in the filter string, rescans the whole list, collects each
 * record with equal fields 1, 2 and 3, and appends the key to the filter
 * (the very first entry is additionally prefixed with "field0$").
 * Prints the elapsed milliseconds, the number of collected records and the
 * filter string.
 *
 * @param al9 the '$'-separated records to filter
 */
public static void process(List<String> al9) {
    long start = System.currentTimeMillis();

    ArrayList<String> bomal2 = new ArrayList<String>();
    String filter = "";

    for (String record : al9) {
        String[] fields = record.split("\\$");
        // The filter is empty only before the first record has been handled.
        boolean firstEntry = filter.equals("");
        if (!firstEntry && filter.contains(fields[2] + "$" + fields[3] + ";")) {
            continue;
        }

        for (String candidate : al9) {
            String[] other = candidate.split("\\$");
            if (other[2].equals(fields[2])
                    && other[1].equals(fields[1])
                    && other[3].equals(fields[3])) {
                bomal2.add(candidate);
            }
        }

        String entry = fields[2] + "$" + fields[3] + ";";
        filter = firstEntry ? filter + fields[0] + "$" + entry : filter + entry;
    }
    long end = System.currentTimeMillis();

    System.out.println(end - start);
    System.out.println(bomal2.size());
    System.out.println(filter);
}
看看这个代码，能否完成楼主要求 public static void doService(ArrayList<String> al9, Collection<String> bomal2) {
// Two linear passes: count how often each "field1$field2$field3" key occurs in al9,
// then add to bomal2 every record whose key occurs more than once.
// NOTE(review): unlike the original nested-loop filter, records whose key occurs only
// once are dropped, and records are not restricted to the first field-1 value seen for
// a (field2, field3) pair - confirm this is the intended requirement.
HashMap<String,Integer> counter = new HashMap<String,Integer>();
// Reused for building every key, to avoid allocating a builder per record.
StringBuilder buff = new StringBuilder();
// Pass 1: occurrence count per key.
for(String bom : al9){
// split() with no limit drops trailing empty strings, so a record whose field 3 and
// everything after it are empty yields fewer than four elements and boma[3] throws.
String[] boma = bom.split("\\$");
buff.setLength(0);
buff.append(boma[1]).append('$').append(boma[2]).append('$').append(boma[3]);
String key = buff.toString();
if(counter.get(key)==null){
counter.put(key, Integer.valueOf(1));
}else{
counter.put(key, counter.get(key)+1);
}
}
// Pass 2: keep records whose key was counted more than once (each record is split again).
for(String bom : al9){
String[] boma = bom.split("\\$");
buff.setLength(0);
buff.append(boma[1]).append('$').append(boma[2]).append('$').append(boma[3]);
String key = buff.toString();
// Every key was inserted in pass 1, so the unboxing cannot hit null.
int c = counter.get(key);
if(c>1){
bomal2.add(bom);
}
}
}
楼主，你的千万级别数据量，究竟是多少千万？每个字符串究竟多长？我算了下，如果是壹仟万行，每行20个字符的话，总容量是 200MB。甚至总容量到 1000MB，都能接受。如果总数据量仅仅是这个数量级，我认为用22楼的方法是可行的，消耗时间不会完全无法接受，只是需要把JVM的内存配大点就行了。
有1800万笔，每个字符串有100~200个字，因为是各字段相加。
用 java -Xmx30000m 运行，看系统监视，该进程达到13xxxM的使用量；本服务器有49G的内存。
后来我发现可以在数据库中获取数据时直接筛选会比较节省时间但这个讨论还是帮了我很多
也谢谢弟兄们给的建议让我在第一次对千万级数据量处理完成