不知有没有帮助
文法={Vt, Vn, S, p}
Vt={字母,',','\r\n','\r'}
Vn={CVS,字符串,字符串1,字符串2,字母}
S=CVSp:
CVS -> 字符串(,字符串)*
字符串 -> 字符串1 | "字符串1" | "字符串2"
字符串1 -> 字母+
字符串2 -> 字母+,字母+ | 字母+\r字母+ | 字母+\r\n字母+
字母 -> a|b|c|………… |z
文法={Vt, Vn, S, p}
Vt={字母,',','\r\n','\r'}
Vn={CVS,字符串,字符串1,字符串2,字母}
S=CVSp:
CVS -> 字符串(,字符串)*
字符串 -> 字符串1 | "字符串1" | "字符串2"
字符串1 -> 字母+
字符串2 -> 字母+,字母+ | 字母+\r字母+ | 字母+\r\n字母+
字母 -> a|b|c|………… |z
import org.apache.regexp.*;/* Simple demo of CSV matching using Regular Expressions.
* Does NOT use the "CSV" class defined in the Java CookBook.
* RE Pattern from Chapter 7, Mastering Regular Expressions (p. 205, first edn.)
*/
public class CSVRE {
/** The rather involved pattern used to match CSV's consists of three
* alternations: the first matches quoted fields, the second unquoted,
* the third null fields
*/
public static final String CSV_PATTERN =
"\"([^\"\\\\]*(\\\\.[^\"\\\\]*)*)\",?|([^,]+),?|,"; public static void main(String[] argv) throws IOException, RESyntaxException
{
String line;
// Construct a new Regular Expression parser.
Debug.println("regexp", "PATTERN = " + CSV_PATTERN); // debug
RE csv = new RE(CSV_PATTERN); BufferedReader is = new BufferedReader(new InputStreamReader(System.in)); // For each line...
while ((line = is.readLine()) != null) {
System.out.println("line = `" + line + "'"); // For each field
for (int fieldNum = 0, offset = 0; csv.match(line, offset); fieldNum++) { // Print the field (0=null, 1=quoted, 3=unquoted).
int n = csv.getParenCount()-1;
if (n==0) // null field
System.out.println("field[" + fieldNum + "] = `'");
else
System.out.println("field[" + fieldNum + "] = `" + csv.getParen(n) + "'"); // Skip what already matched.
offset += csv.getParen(0).length();
}
}
}
}
class Debug {
/** Static method to see if a given category of debugging is enabled.
* Enable by setting e.g., -Ddebug.fileio to debug file I/O operations.
* Use like this:<BR>
* if (Debug.isEnabled("fileio"))<BR>
* System.out.println("Starting to read file " + fileName);
*/
public static boolean isEnabled(String category) {
return System.getProperty("debug." + category) != null;
} /** Static method to println a given message if the
* given category is enabled for debugging.
*/
public static void println(String category, String msg) {
if (isEnabled(category))
System.out.println(msg);
}
/** Same thing but for non-String objects (think of the other
* form as an optimization of this).
*/
public static void println(String category, Object stuff) {
println(category, stuff.toString());
}
}
我看了一下, 那使用的是is.readLine(),
这样如果,,中间本身有\r\n,但是这个并不表示下一行的,所以读csv的行就不对了。而且它使用的是
import org.apache.regexp.*;
这个包, 不是1.4自带的, 你能不能帮我移植过去?
我这里没有org.apache.regexp的文档看别人德正则表达式满难懂的。
我这样试过
public static final String CSV_PATTERN =
"\"([^\"\\\\]*(\\\\.[^\"\\\\]*)*)\",?|([^,]+),?|,";
java.util.regex.Pattern p = java.util.regex.Pattern.compile(CSV_PATTERN);
Matcher matcher = p.matcher("abc,\"a,bbc\",ad");
System.out.println(matcher.matches());出来的是false,应该怎么写呢??说实话那个表达式我看不懂:(
<!--
var sourceString="abc,\"a,b\",abdd,\"this,this,this\",c"; // mach "xxx"
var reg1=/^\"[^\"]+\"/; // mach xxx
var reg2=/^\w+,/; document.writeln("Source String:" + sourceString + " <br>" );
document.writeln("<hr>");
document.writeln("<table border='1'>");
document.writeln("<tr><td>Mached</td><td>Remain</td></tr>");
while (true)
{ if (sourceString.length == 0)
break; var result1=reg1.exec(sourceString);// match
var result2=reg2.exec(sourceString);
document.writeln("<tr>"); if(result1 != null) // find the "xxxx",
{
// delete the ""
var strTemp=result1[0].substring(1,result1[0].length-1);
document.writeln("<td>" + strTemp + " </td>" );
sourceString=sourceString.substring(result1.index+result1[0].length+1);
document.writeln("<td>" + sourceString + "</td>");
}
else if (result2 != null) // find the xxx,
{
// delete the "," char
var strTemp=result2[0].substring(0,result2[0].length-1); document.writeln("<td>" + strTemp + " </td>" );
sourceString=sourceString.substring(result2.index+result2[0].length);
document.writeln("<td>" + sourceString + "</td>");
}
else // find nothing or the last one.
{
document.writeln("<td>" + sourceString + " </td>" );
document.writeln("<td> NA </td>"); sourceString="";
}
document.writeln("</tr>");
} document.writeln("</table>");
//-->
</SCRIPT>有什么問題再測!呵呵...
import java.util.*;public class CSVParser { public static ArrayList parse(String str1) {
ArrayList list = new ArrayList();
Pattern p = Pattern.compile("\"([^\"]+)\"|([^,]+)");
Matcher m = p.matcher(str1);
while(m.find()) {
list.add(m.group().replaceAll("\"", ""));
}
return list;
} public static void main(String[] args) throws Throwable {
String csv1 = "a,bc,\"a,b\",bc,c";
String csv2 = "abc,\"abc\",c";
ArrayList list1 = CSVParser.parse(csv1);
System.out.println("list1");
for (int i =0; i < list1.size(); i++)
System.out.println(list1.get(i));
ArrayList list2 = CSVParser.parse(csv2);
System.out.println("list2");
for (int i =0; i < list2.size(); i++)
System.out.println(list2.get(i));
}
}
这样是不是把不是开头的"也去掉了。PS刚才没有说"也是个特殊字符。
要是"是内容的一部分也应该要
ac,"aaaba"aaa",ab真正内容是
ac
aaaba"aaa
ab楼上的不会觉得我太过分吧?
真的都做出来了, 我还可以再加分
public static void main(String[] args) {
String targetStr = "aaa,\"b b\",\"c,c\"";
String regStr = "([\\p{Alnum}]+)" +
"|(\"[\\p{Alnum}\\p{Space},]+)\"";
Pattern patt = Pattern.compile(regStr);
Matcher m = patt.matcher(targetStr);
int count = 0; while (m.find()) {
String str = m.group();
if (str.charAt(0) == '\"') {
str = str.substring(1, str.length()-1);
}
count++;
System.out.println("Matche #" + count + ": " + str);
} System.out.println("Total " + m.groupCount() + " result(s) found!\n");
}
}
改了一下,效果不是很好import java.util.regex.*;
import java.util.*;
import java.io.*;public class CSVParser { private StringBuffer buf = null; public ArrayList parse() {
String str;
ArrayList list = new ArrayList();
Pattern p = Pattern.compile("(\"(.*)\")|(([^,\\x0d]+))");
//Pattern p = Pattern.compile("([^\\x0d]+)");
Matcher m = p.matcher(buf); while(m.find()) {
str = m.group();
//System.out.println(str);
//str = str.replaceAll("(?sm)(\"(\"))|(\"([^\"])?)|(^\\x0a(.*))", "$2");
str = str.replaceAll("(?sm)(^\\x0a(.*))", "$2");
str = str.replaceAll("(?sm)(^\"(.*)\"$)", "$2");
str = str.replaceAll("(?sm)(\"(\"))", "$2");
//System.out.println(str);
list.add(str);
}
return list;
} public void readCSVFile(File CSVFile) {
int chr;
buf = new StringBuffer("");
try {
BufferedReader in = new BufferedReader(new FileReader(CSVFile));
while((chr=in.read()) != -1) {
buf.append((char)chr);
//System.out.print((char)chr);
}
in.close();
System.out.println("-------------csv file content--------------");
System.out.print(buf);
} catch (FileNotFoundException fnfex) {
fnfex.printStackTrace();
} catch (IOException ioex) {
ioex.printStackTrace();
}
} public static void main(String[] args) throws Throwable {
File file = new File("Book1.csv");
CSVParser parser = new CSVParser();
parser.readCSVFile(file);
ArrayList list = parser.parse();
int size = list.size();
System.out.println("\n\n--------------parsed result------------------");
System.out.println("size = " + size);
for (int i = 0; i < size; i++) {
System.out.println("list[" + i + "] = " + list.get(i));
}
}
}
用notepad打开的Book1.csv内容
a,bc,"a,b",bc,c
a,ab c,c,,
a,"aaaba""aaa",ab,,
a,"a
b
c","a b ,c",d,运行结果
-------------csv file content--------------
a,bc,"a,b",bc,c
a,ab c,c,,
a,"aaaba""aaa",ab,,
a,"a
b
c","a b ,c",d,
--------------parsed result------------------
size = 16
list[0] = a
list[1] = bc
list[2] = a,b
list[3] = bc
list[4] = c
list[5] = a
list[6] = ab c
list[7] = c
list[8] = a
list[9] = aaaba"aaa
list[10] = ab
list[11] = a
list[12] = a
b
c
list[13] = a b ,c
list[14] = d
list[15] =
让我郁闷的是不知道为什么要多次replaceAll,此处效率低