al9是一个ArrayList,但存有一千多万笔String数据,所以进行如下过滤工作会跑特别久。
怎么修改以下过滤工作可以加快处理?(任何可以加快的方法都可以)
String filter="";
// Filtering pass over al9, whose records are '$'-delimited strings.
// For each record whose "field2$field3;" text is not yet found in `filter`, the
// whole list is rescanned and every record with equal fields 1, 2 and 3 is
// appended to bomal2; the pair is then appended to `filter`.
// NOTE(review): `it` is created here but never used in this snippet.
Iterator<String> it = al9.iterator();
Iterator itt = al9.iterator();
while(itt.hasNext()){
String bom = (String)itt.next();
String[] boma = bom.split("\\$");
// Nothing recorded yet: the scan always runs.
if(filter.equals("")){
// Full rescan of al9 for records sharing fields 1-3 with the current record.
Iterator it2 = al9.iterator();
while(it2.hasNext()){
String bom2 = (String)it2.next();
String[] bom2a = bom2.split("\\$");
if((bom2a[2].equals(boma[2])) && (bom2a[1].equals(boma[1])) && (bom2a[3].equals(boma[3]))){
bomal2.add(bom2);
}
//i01++;
}
// Unlike the last branch, this first entry is also prefixed with field 0.
filter=filter+boma[0]+"$"+boma[2]+"$"+boma[3]+";";
// Substring test against the accumulated filter text; a hit means the record is skipped.
}else if(filter.contains(boma[2]+"$"+boma[3]+";")){
}else{
// Pair not found in the filter: same rescan as in the first branch.
Iterator it2 = al9.iterator();
while(it2.hasNext()){
String bom2 = (String)it2.next();
String[] bom2a = bom2.split("\\$");
if((bom2a[2].equals(boma[2])) && (bom2a[1].equals(boma[1])) && (bom2a[3].equals(boma[3]))){
bomal2.add(bom2);
}
//i02++;
}
filter=filter+boma[2]+"$"+boma[3]+";";
}
}
怎么修改以下过滤工作可以加快处理?(任何可以加快的方法都可以)
String filter="";
// Quoted copy of the question's filter loop: the outer loop walks al9, the inner
// loops rescan al9 and copy records with equal fields 1, 2 and 3 into bomal2.
// NOTE(review): `it` is created here but never used in this snippet.
Iterator<String> it = al9.iterator();
Iterator itt = al9.iterator();
while(itt.hasNext()){
String bom = (String)itt.next();
String[] boma = bom.split("\\$");
// Empty filter: scan unconditionally, then store field0$field2$field3;
if(filter.equals("")){
Iterator it2 = al9.iterator();
while(it2.hasNext()){
String bom2 = (String)it2.next();
String[] bom2a = bom2.split("\\$");
if((bom2a[2].equals(boma[2])) && (bom2a[1].equals(boma[1])) && (bom2a[3].equals(boma[3]))){
bomal2.add(bom2);
}
//i01++;
}
filter=filter+boma[0]+"$"+boma[2]+"$"+boma[3]+";";
// "field2$field3;" occurs somewhere in the filter text: nothing to do.
}else if(filter.contains(boma[2]+"$"+boma[3]+";")){
}else{
// Otherwise scan again and store field2$field3;
Iterator it2 = al9.iterator();
while(it2.hasNext()){
String bom2 = (String)it2.next();
String[] bom2a = bom2.split("\\$");
if((bom2a[2].equals(boma[2])) && (bom2a[1].equals(boma[1])) && (bom2a[3].equals(boma[3]))){
bomal2.add(bom2);
}
//i02++;
}
filter=filter+boma[2]+"$"+boma[3]+";";
}
}
①是否可以在数据库端完成这些事情?
②如果在数据进入的时候规范字符串格式,则只需要直接比较bom和bom2两个字符串是否相等即可,无需split后逐一比较各分量。
③是否可以在hashCode上下点功夫,提高算法速度?
// Reformatted copy of the question's filter loop.
// bomal2 collects the selected records; filter is a growing text of
// "field2$field3;" entries that is searched with String.contains.
ArrayList<String> bomal2 = new ArrayList<String>();
String filter = "";
// NOTE(review): `it` is created here but never used in this snippet.
Iterator<String> it = al9.iterator();
Iterator itt = al9.iterator();
while (itt.hasNext())
{
String bom = (String) itt.next();
String[] boma = bom.split("\\$");
// Branch 1: nothing recorded yet, so the scan always runs.
if (filter.equals(""))
{
// Rescan the whole list for records whose fields 1, 2 and 3 equal the current record's.
Iterator it2 = al9.iterator();
while (it2.hasNext())
{
String bom2 = (String) it2.next();
String[] bom2a = bom2.split("\\$");
if ((bom2a[2].equals(boma[2])) && (bom2a[1].equals(boma[1])) && (bom2a[3].equals(boma[3])))
{
bomal2.add(bom2); }
// i01++;
}
// This first entry additionally carries field 0 in front of the pair.
filter = filter + boma[0] + "$" + boma[2] + "$" + boma[3] + ";";
}
// Branch 2: the pair text already occurs in the filter, so the record is skipped.
else if (filter.contains(boma[2] + "$" + boma[3] + ";"))
{ }
// Branch 3: same rescan as branch 1, storing only the pair.
else
{
Iterator it2 = al9.iterator();
while (it2.hasNext())
{
String bom2 = (String) it2.next();
String[] bom2a = bom2.split("\\$");
if ((bom2a[2].equals(boma[2])) && (bom2a[1].equals(boma[1])) && (bom2a[3].equals(boma[3])))
{
bomal2.add(bom2); }
// i02++;
}
filter = filter + boma[2] + "$" + boma[3] + ";";
}
}
{
}
else if (filter.contains(boma[2] + "$" + boma[3] + ";"))
{
}
else
{
}
这段可以把第一个 if 删除,直接写成:
if (filter.contains(boma[2] + "$" + boma[3] + ";"))
{
}
else
{
}
这样逻辑的复杂性降低了一点,代码更好懂。
{
    // Single-test form of the filter: every record goes through the same
    // "is field2$field3; already in the filter text?" check.
    ArrayList<String> list = new ArrayList<String>();
    ArrayList<String> bomal2 = new ArrayList<String>();
    String filter = "";
    for (String record : list)
    {
        String[] fields = record.split("\\$");
        String key = fields[2] + "$" + fields[3] + ";";
        if (filter.contains(key))
        {
            // Pair text already present in the filter: skip this record.
            continue;
        }
        // Collect every record whose fields 1, 2 and 3 equal the current record's.
        for (String candidate : list)
        {
            String[] candidateFields = candidate.split("\\$");
            boolean sameGroup = candidateFields[1].equals(fields[1])
                    && candidateFields[2].equals(fields[2])
                    && candidateFields[3].equals(fields[3]);
            if (sameGroup)
            {
                bomal2.add(candidate);
            }
        }
        filter += key;
    }
}
性能肯定没有提升,但代码更精炼,更易读了。
楼主的程序逻辑是不是有问题:楼主程序是根据后三项判重复,有重复的则加入另一LIST。但没排除元素本身与自己相等的情况。
if (filter.contains(key))
filter里的值是 12$13;14$15;...
这样以后有 2$13;的也会 contains() 为 true, 这不知道是需求还是BUG
是BUG的话可以使filter初始为";", 比较时这样:
if (filter.contains(";" + key))
在key前加个分号。当然,楼主这样的做法必须保证值是不会有分号。
我觉得contains还是用hashSet快些
{
// Benchmark driver: builds 10000 random records of the form "$a$b$c" (each part
// from nextInt(100)) and runs test, test2, test3 and test4 on the same list,
// printing elapsed milliseconds, result size and result contents after each run.
public static void main(String[] args)
{
final ArrayList<String> list = new ArrayList<String>(10000);
ArrayList<String> bomal2 = new ArrayList<String>(10000);
Random r = new Random();
for (int i = 0; i < 10000; i++)
{
String v = "$" + r.nextInt(100) + "$" + r.nextInt(100) + "$" + r.nextInt(100);
list.add(v);
} System.out.println("start ... ");
long start, end; start = System.currentTimeMillis();
test(list, bomal2);
end = System.currentTimeMillis();
System.out.println("\ntest spend: " + (end - start));
System.out.println("size: " + bomal2.size());
// bomal2 is cleared before each following run so the sizes are comparable.
System.out.println(bomal2.toString()); bomal2.clear(); start = System.currentTimeMillis();
test2(list, bomal2);
end = System.currentTimeMillis();
System.out.println("\ntest2 spend: " + (end - start));
System.out.println("size: " + bomal2.size());
System.out.println(bomal2.toString());
bomal2.clear(); start = System.currentTimeMillis();
test3(list, bomal2);
end = System.currentTimeMillis();
System.out.println("\ntest3 spend: " + (end - start));
System.out.println("size: " + bomal2.size());
System.out.println(bomal2.toString()); bomal2.clear(); start = System.currentTimeMillis();
test4(list, bomal2);
end = System.currentTimeMillis();
System.out.println("\ntest4 spend: " + (end - start));
System.out.println("size: " + bomal2.size());
System.out.println(bomal2.toString());
} public static void test(final ArrayList<String> list, ArrayList<String> bomal2)
{
// Variant 1: the filter is one String of "field2$field3;" entries. It starts as ";"
// and each lookup prepends ";" to the key, so a key only matches at an entry boundary.
// Every record of the inner scan is split() and compared on fields 1, 2 and 3.
String filter = ";";
for (String bom : list)
{
String[] bomArray = bom.split("\\$");
String key = bomArray[2] + "$" + bomArray[3] + ";";
if (!filter.contains(";" + key))
{
for (String bom2 : list)
{
String[] bom2Array = bom2.split("\\$");
if ((bom2Array[1].equals(bomArray[1])) && (bom2Array[2].equals(bomArray[2]))
&& (bom2Array[3].equals(bomArray[3])))
{
bomal2.add(bom2);
}
}
filter = filter + key;
}
}
System.out.println(filter);
} public static void test2(final ArrayList<String> list, ArrayList<String> bomal2)
{
// Variant 2: same scan as test, but the seen "field2$field3" keys are kept in a
// HashSet instead of being searched for inside a String.
Set<String> filter = new HashSet<String>(list.size() / 2);
for (String bom : list)
{
String[] bomArray = bom.split("\\$");
String key = bomArray[2] + "$" + bomArray[3];
if (!filter.contains(key))
{
for (String bom2 : list)
{
String[] bom2Array = bom2.split("\\$");
if ((bom2Array[1].equals(bomArray[1])) && (bom2Array[2].equals(bomArray[2]))
&& (bom2Array[3].equals(bomArray[3])))
{
bomal2.add(bom2);
}
}
filter.add(key);
}
}
System.out.println(filter);
} public static void test3(final ArrayList<String> list, ArrayList<String> bomal2)
{
// Variant 3: HashSet filter without split(). The key is everything after the second
// '$'; a record matches when it ends with the current record's text starting at its
// first '$'.
// NOTE(review): presumably equivalent to the field comparison in test/test2 only for
// the 4-part "$a$b$c" records generated in main - confirm before using on other data.
Set<String> filter = new HashSet<String>(list.size() / 2);
for (String bom : list)
{
int bomIdx1 = bom.indexOf("$");
int bomIdx2 = bom.indexOf("$", bomIdx1 + 1);
String key = bom.substring(bomIdx2 + 1);
if (!filter.contains(key))
{
for (String bom2 : list)
{
if (bom2.endsWith(bom.substring(bomIdx1)))
{
bomal2.add(bom2);
}
}
filter.add(key);
}
}
System.out.println(filter);
} public static void test4(final ArrayList<String> list, ArrayList<String> bomal2)
{
// Variant 4: same matching as test3, but indexed loops are used and the inner scan
// starts at the current position i instead of at 0.
Set<String> filter = new HashSet<String>(list.size() / 2);
for (int i = 0; i < list.size(); i++)
{
String bom = list.get(i);
int bomIdx1 = bom.indexOf("$");
int bomIdx2 = bom.indexOf("$", bomIdx1 + 1);
String key = bom.substring(bomIdx2 + 1);
if (!filter.contains(key))
{
for (int j = i; j < list.size(); j++)
{
String bom2 = list.get(j);
if (bom2.endsWith(bom.substring(bomIdx1)))
{
bomal2.add(bom2);
}
}
filter.add(key);
}
}
System.out.println(filter);
}
}
以下是我的测试结果:
start ...
test spend: 10842
size: 6350
test2 spend: 10187
size: 6350
test3 spend: 1321
size: 6350
test4 spend: 672
size: 6350
for(String bom : al9){
String[] boma = bom.split("\\$");
if(!filter.contains(boma[2] + "$" + boma[3])){
for(String bom2 : al9){
String[] bom2a = bom2.split("\\$");
if ((bom2a[2].equals(boma[2]))
&& (bom2a[1].equals(boma[1]))
&& (bom2a[3].equals(boma[3]))) {
bomal2.add(bom2);
}
}
filter.add(boma[2] + "$" + boma[3]);
}
}
{
// Quoted simplified filter: `list` is declared empty here as a stand-in for the input.
ArrayList<String> list = new ArrayList<String>();
ArrayList<String> bomal2 = new ArrayList<String>();
// Accumulates "field2$field3;" entries and is searched with String.contains.
String filter = "";
for (String bom : list)
{
String[] bomArray = bom.split("\\$");
String key = bomArray[2] + "$" + bomArray[3] + ";";
// Key text found somewhere in the filter: record is skipped.
if (filter.contains(key))
{
}
else
{
// Rescan the list and copy records whose fields 1, 2 and 3 equal the current record's.
for (String bom2 : list)
{
String[] bom2Array = bom2.split("\\$");
if ((bom2Array[1].equals(bomArray[1])) && (bom2Array[2].equals(bomArray[2])) && (bom2Array[3].equals(bomArray[3])))
{
bomal2.add(bom2);
}
}
filter = filter + key;
}
}
}
也就是说,要在一个千万笔的数据中找出 boma[1]、boma[2]、boma[3] 相同的记录留下,其他都删除。
看你这几天连续发贴,都是问此类相关问题,应该是工作上的问题吧。客观地说,自己开发这种代码投资收益比太低,因为涉及较多环节的工作量,如果你不是为了锻炼自己能力的话,建议:
1、将数据导入Oracle中,千万级数据,导入并不会太久,应该不会超过1个小时。
2、用Select ... Group By ... Having 一句话得到你要的结果集,千万级记录对Oracle来说不算太大,预先建好索引,然后打开并发计算参数。
感谢你的建议但对把逻辑转成sql不太熟悉
语句可能比你想象的要简单多,你可以用数据库弄个7、8行简单模拟下试试看。按照你的要求:“也就是在一个千万笔数据中找出 boma[1],boma[2],boma[3]相同的留下其他都删除”。
假设将这三个数,存入数据库的字段名分别为: fa, fb, fc,表名为 tab
你需要的SQL如下:
Select fa, fb, fc, count(*) As num /* num 是重复出现的次数*/
From tab
Group By fa, fb, fc /* 分组 */
Having count(*) > 1 /* 出现超过1次的才算 */
That's all. 当然,如果你确实是为了锻炼自己的设计开发能力,那就另当别论了。
但能力有限,对算法和数据结构知识匮乏;
对Java正则表达式的知识也很肤浅,出于学习Java正则表达式的目的,改写了一下;
测试结果:速度快了一点,不知道内存怎么样(不知道性能是否要考虑空间);
static String charReguExp = "(([a-zA-Z]*\\$)([a-zA-Z]*\\$([a-zA-Z]*\\$[a-zA-Z]*)\\$)[a-zA-Z]*)";
// Compiled once from charReguExp so the expression is not recompiled per record.
static Pattern pattern = Pattern.compile(charReguExp);
// Mutable text buffer held in a static field (one instance for the whole class).
static StringBuffer sb = new StringBuffer("");
// static List<String> al9 = Arrays.asList("aaa$bbb$ccc$ddd$",
// "bbb$hhh$ccc$ddd$ttt",
// "ttt$jj$nnn$ddd$ooo",
// "bbb$hhh$cc$ddd$ttt");
// Benchmark input; filled by the static initializer below.
static List<String> al9 = new ArrayList<String>();
// Adds the four sample records 5,000,000 x 4 times, i.e. 80,000,000 list entries.
static{
for(int i = 1; i <= 5000000; i++){
for(int j = 1; j <= 4; j++){
al9.add("aaa$bbb$ccc$ddd$");
al9.add("bbb$hhh$ccc$ddd$ttt");
al9.add("ttt$jj$nnn$ddd$ooo");
al9.add("bbb$hhh$cc$ddd$ttt");
}
}
}
// Entry point: runs the regex-based variant and then the original loop on the
// same al9 list, printing a separator line between the two reports.
public static void main(String[] args) {
prossessByRegEx(al9);
System.out.println("============");
process(al9);
}
/**
 * Regex-based variant of the filter, timed end to end.
 *
 * <p>For each record whose "field2$field3;" key (regex group 4) is not yet present in
 * the filter text, {@code addValueForList} copies the matching records into the result
 * list and the key is appended to the filter. Prints, in order: elapsed milliseconds,
 * result size, final filter text.
 *
 * <p>An empty input list is reported as size 0 with an empty filter instead of failing
 * on {@code al9.get(0)}.
 *
 * @param al9 records to filter; each is expected to match {@code pattern}
 */
public static void prossessByRegEx(List<String> al9){
    long start = System.currentTimeMillis();
    ArrayList<String> bomal2 = new ArrayList<String>();
    StringBuilder filter = new StringBuilder();
    if (!al9.isEmpty()) {
        // The first record is always processed; as in the loop-based version, its
        // filter entry also carries the leading field (group 2) before the key.
        String first = al9.get(0);
        addValueForList(first, al9, bomal2);
        filter.append(getString(first, 2, 4));
        for (int i = 1; i < al9.size(); i++) {
            String record = al9.get(i);
            // Extract the key once and search the builder directly, rather than
            // copying the whole filter to a String and re-running the regex.
            String key = getString(record, 4);
            if (filter.indexOf(key) < 0) {
                addValueForList(record, al9, bomal2);
                filter.append(key);
            }
        }
    }
    long end = System.currentTimeMillis();
    System.out.println(end - start);
    System.out.println(bomal2.size());
    System.out.println(filter.toString());
}

/**
 * Concatenates the requested capture groups of {@code pattern} for one record.
 *
 * @param strValue record text to match
 * @param groups   capture-group numbers to append, in order
 * @return the concatenated groups followed by ";", or "" when the pattern is not
 *         found in {@code strValue} (or the selected groups are all empty)
 */
private static String getString(String strValue, int... groups){
    // Local builder: no shared static state is touched between calls.
    StringBuilder out = new StringBuilder();
    Matcher regexMatcher = pattern.matcher(strValue);
    if (regexMatcher.find()) {
        for (int group : groups) {
            out.append(regexMatcher.group(group));
        }
    }
    if (out.length() > 0) {
        out.append(";");
    }
    return out.toString();
}
/**
 * Tells whether the group-4 key of {@code strValue} occurs anywhere in {@code filter}.
 */
private static boolean contains(String filter, String strValue){
    String key = getString(strValue, 4);
    return filter.indexOf(key) >= 0;
}
/**
 * Appends to {@code bomal2} every element of {@code al9} whose group-3 text equals
 * the group-3 text of {@code strValue}.
 */
private static void addValueForList(String strValue,List<String> al9, List<String> bomal2){
    final String wanted = getString(strValue, 3);
    for (String candidate : al9) {
        if (!getString(candidate, 3).equals(wanted)) {
            continue;
        }
        bomal2.add(candidate);
    }
}
/**
 * Loop-based filter used as the comparison baseline.
 *
 * <p>Walks {@code al9}; for each record whose "field2$field3;" text is not found in the
 * accumulated filter string, rescans the list and collects every record with equal
 * fields 1, 2 and 3. Prints elapsed milliseconds, the number of collected records and
 * the final filter string, each on its own line.
 *
 * @param al9 '$'-delimited records
 */
public static void process(List<String> al9) {
    long start = System.currentTimeMillis();
    ArrayList<String> bomal2 = new ArrayList<String>();
    String filter = "";
    for (String record : al9) {
        String[] fields = record.split("\\$");
        boolean nothingRecorded = filter.equals("");
        // Once the filter has content, a substring hit on the pair skips the record.
        if (!nothingRecorded && filter.contains(fields[2] + "$" + fields[3] + ";")) {
            continue;
        }
        for (String candidate : al9) {
            String[] candidateFields = candidate.split("\\$");
            boolean sameGroup = candidateFields[2].equals(fields[2])
                    && candidateFields[1].equals(fields[1])
                    && candidateFields[3].equals(fields[3]);
            if (sameGroup) {
                bomal2.add(candidate);
            }
        }
        String pair = fields[2] + "$" + fields[3] + ";";
        // The very first entry is additionally prefixed with field 0.
        filter = nothingRecorded ? filter + fields[0] + "$" + pair : filter + pair;
    }
    long end = System.currentTimeMillis();
    System.out.println(end - start);
    System.out.println(bomal2.size());
    System.out.println(filter);
}
// Pass 1: count how many records share each "field1$field2$field3" key.
HashMap<String,Integer> counter = new HashMap<String,Integer>();
StringBuilder buff = new StringBuilder();
for (String record : al9) {
    String[] fields = record.split("\\$");
    buff.setLength(0);
    buff.append(fields[1]).append('$').append(fields[2]).append('$').append(fields[3]);
    String groupKey = buff.toString();
    Integer seen = counter.get(groupKey);
    counter.put(groupKey, seen == null ? Integer.valueOf(1) : Integer.valueOf(seen.intValue() + 1));
}
// Pass 2: keep only the records whose key was counted more than once.
for (String record : al9) {
    String[] fields = record.split("\\$");
    buff.setLength(0);
    buff.append(fields[1]).append('$').append(fields[2]).append('$').append(fields[3]);
    String groupKey = buff.toString();
    if (counter.get(groupKey).intValue() > 1) {
        bomal2.add(record);
    }
}
}
共有1800万笔数据,每个字符串有100~200个字,因为是由各字段相加而成。
用 java -Xmx30000m 运行,从系统监视看,该进程达到13xxxM的内存使用量;本服务器有49G的内存。
后来我发现可以在数据库中获取数据时直接筛选,这样会比较节省时间,但这个讨论还是帮了我很多。
也谢谢弟兄们给的建议,让我第一次完成了对千万级数据量的处理。