正则问题

String zhPattern = "[\u4e00-\u9fa5]+"; 这个是提取汉字。你最好给个例子，和你期望的结果！

在老紫竹的基础上，写了个笨方法，如下：import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;/**
* 测试1：文档注释
*/
// Self-test utility: loads the file named by PATH (per the original comment on that line,
// this program's own source), removes comments with REGX1 and prints every run of Chinese
// characters (REGX2) that is left in the remaining code.
// The Chinese comments, string literal and variable name below are the input of that
// self-test, so they are intentionally kept exactly as written.
public class SrcZhcnInfo { public static final String PATH = "C:/test/SrcZhcnInfo.java"; //此源代码文件自身
// REGX1: either a block or Javadoc comment (DOTALL, so it may span lines), or a line comment
// running to the end of its line (MULTILINE, so the dollar anchor matches at each line end).
// NOTE(review): the pattern has no notion of string literals, so comment markers that occur
// inside a quoted string would be treated as comment starts too.
public static final String REGX1 = "(?s)(?:/\\*\\*?.*?\\*/)|(?sm)(?://.*?$)";
// REGX2: one or more consecutive characters in the range U+4E00 to U+9FA5.
public static final String REGX2 = "[\u4e00-\u9fa5]+"; /*
* 测试2：多行注释
*/
// Entry point: read PATH, strip comments, then print each Chinese run as a numbered line.
// args is not used.
public static void main(String[] args) {
//测试3：单行注释
// Fixture: Chinese text inside a string literal (expected in the output).
@SuppressWarnings("unused")
String teststr = "非注释中文文本";
// Fixture: Chinese identifier (expected in the output).
@SuppressWarnings("unused")
String 中文变量名 = null;
// Accumulates the whole file content.
StringBuilder sb = new StringBuilder(512);
BufferedReader br = null;
try {
br = new BufferedReader(new FileReader(PATH));
// Copy the file into sb in chunks of up to 64 characters.
char[] buf = new char[64];
int len = 0;
while ((len = br.read(buf)) != -1) {
sb.append(buf, 0, len);
}
} catch (FileNotFoundException e) {
// The error is only printed; processing continues with whatever sb holds.
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
} finally {
// br stays null when the file could not be opened, hence the guard.
if (br != null) {
try {
br.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
String input = null;
// Step 1: delete everything REGX1 recognises as a comment.
Pattern p1 = Pattern.compile(REGX1);
Matcher m1 = p1.matcher(sb);
input = m1.replaceAll("");
// Step 2: report every Chinese run in the comment-free text, numbered from 1.
Pattern p2 = Pattern.compile(REGX2);
Matcher m2 = p2.matcher(input);
int n = 0;
while (m2.find()) {
// Same text as the current match, taken via its start and end offsets.
String ms = input.substring(m2.start(), m2.end());
System.out.println(++n + ": [" + ms + "]");
}
}}输出：
1: [非注释中文文本]
2: [中文变量名]

"(?s)(?:/\\*\\*?.*?\\*/)|(?sm)(?://.*?$)";

我想把比如这样的
String s1 = "嘿嘿 ";
String s1 = "throw new Exception(\"已经超过20项笔记。\");";
将\"已经超过20项笔记。\"将这段提取出来
反正是双引号之间的

String s = "中文"+"yinwen"+"中文"
StringBuffer sb = new StringBuffer();
sb.append("中文＋");
主要是想记录行号所以只能一行一行匹配了

字符串输入源是一个 Java 文件，而且还得是非注释中的引号中的汉字。你这样做等于让正则表达式来做语法分析，实际上大家都知道正则表达式不是用来干这个的。举几个例子就知道要用正则实现有多少困难：
1，String str = "重阳节";   // String str = "重阳节";
目的：抽取“重阳节”，但后面有个注释！
2，String str = "今天是10月7日\"重阳节\"";
目的：抽取“今天是10月7日"重阳节"”，但若以引号分割，实际得到的可能是“今天是10月7日”和空串。
3，String str = "重阳节\"你好";  // 重阳节"你好
目的：抽取“重阳节"你好”，但由于引号的关系，得到的可能是“重阳节”和“;  // 重阳节”。
4，String str = "重阳节 // String str = \"重阳节\";";
目的：抽取“重阳节 // String str = "重阳节";”，呵呵，这个就更加复杂了！
5，String str = "重阳节 // String str = \"重阳节\";";  // String str = "重阳节 // String str = \"重阳节\";";
目的：…………

1.首先我用流逐行读取java文本
2.然后把其中双引号中的带有中文的找出来结果输出:
"中aaa外那"
"中文"
"中ss文+" "中文"
特别注意是"+"号连起来的
可以不考虑注释，注释已经解决。下边是一个被读取文件的片段：
StringBuffer sb = new StringBuffer();
String y = "中aaa外那 ";
sb.append("中文");
String s = "中ss文+"+"yinwen"+"中文";
throw new Exception("中文");

这个正则的确很麻烦，不知道火龙大哥会写出什么样的，期待ING~

import java.io.FileReader;
import java.io.IOException;
import java.io.LineNumberReader;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Prints every double-quoted string literal that contains at least one Chinese character,
 * prefixed with the number of the line it was found on.
 *
 * <p>The input is read line by line. Whole-line comments and the interior of multi-line
 * block comments are skipped; on every other line, complete block comments, trailing line
 * comments and all text outside string literals are removed before the literals are
 * inspected.
 *
 * <p>Known limitation: a block comment that is opened after some code on a line and is not
 * closed on that same line is not recognised as the start of a multi-line comment.
 */
public class RegexTest {

    /** File scanned when no path is given on the command line. */
    private static final String DEFAULT_FILENAME = "d:/test.java";

    // A complete block comment (opened and closed within the matched text).
    private static final String COMMENT_REGEX1 = "/\\*[^*]*\\*+(?:[^/*][^*]*\\*+)*/";

    // A line comment running to the end of the line.
    private static final String COMMENT_REGEX2 = "//[^\n]*+";

    // A string literal containing at least one character in the range U+4E00 to U+9FAF.
    // The look-ahead checks for such a character; the rest matches the literal itself,
    // treating a backslash plus the following character as one unit.
    private static final String QUOTE_CHINESE_REGEX =
            "\"(?=(?:\\\\.|[^\"])*[\u4e00-\u9faf])[^\\\\\"]*(?:\\\\.[^\\\\\"]*)*\"";

    // Any string literal, with the same escape handling as above.
    private static final String QUOTE_REGEX = "\"[^\\\\\"]*(?:\\\\.[^\\\\\"]*)*\"";

    // Any character that can start neither a string literal nor a comment.
    private static final String ADDITION = "[^\"/]";

    // Tokenises a line into: plain code | string literal (group 1) plus trailing code |
    // block comment | line comment. Replacing each match with group 1 keeps only literals.
    private static final String QUOTE_STRING_REGEX =
            ADDITION + "+|(" + QUOTE_REGEX + ")" + ADDITION + "*|"
            + COMMENT_REGEX1 + "|" + COMMENT_REGEX2;

    private static final Pattern COMMENT1 = Pattern.compile(COMMENT_REGEX1);
    private static final Pattern COMMENT2 = Pattern.compile(COMMENT_REGEX2);
    private static final Pattern QUOTE_CHINESE = Pattern.compile(QUOTE_CHINESE_REGEX);
    private static final Pattern QUOTE_STRING = Pattern.compile(QUOTE_STRING_REGEX);

    /**
     * Scans a Java source file and prints {@code lineNumber: "literal"} to standard output
     * for every string literal that contains Chinese text.
     *
     * <p>If the file cannot be opened or read, a message is written to standard error and
     * the method returns normally.
     *
     * @param args optional; {@code args[0]} is the path of the file to scan. Without it the
     *             original hard-coded path {@code d:/test.java} is used.
     */
    public static void main(String[] args) {
        String filename = (args != null && args.length > 0) ? args[0] : DEFAULT_FILENAME;

        // The matchers are created once and re-targeted with reset() for every line.
        Matcher comment1 = COMMENT1.matcher("");
        Matcher comment2 = COMMENT2.matcher("");
        Matcher quote = QUOTE_CHINESE.matcher("");
        Matcher quoteString = QUOTE_STRING.matcher("");

        LineNumberReader reader = null;
        try {
            reader = new LineNumberReader(new FileReader(filename));

            boolean inBlockComment = false;
            for (String str; (str = reader.readLine()) != null; ) {
                str = str.trim();

                if (inBlockComment) {
                    // The comment ends wherever its terminator appears, not only when the
                    // line starts with it; whatever follows is ordinary code again.
                    int end = str.indexOf("*/");
                    if (end < 0) {
                        continue;
                    }
                    inBlockComment = false;
                    str = str.substring(end + 2).trim();
                }
                if (str.length() == 0) {
                    continue;
                }

                // The whole line is a single block comment or a line comment.
                if (comment1.reset(str).matches() || comment2.reset(str).matches()) {
                    continue;
                }

                // A block comment opened here and not closed on this line starts a
                // multi-line comment. A comment closed on the same line is left to the
                // tokenising regex below, so code after it is still examined.
                if (str.startsWith("/*") && str.indexOf("*/", 2) < 0) {
                    inBlockComment = true;
                    continue;
                }

                // Keep only the string literals of this line.
                str = quoteString.reset(str).replaceAll("$1");

                // Report the literals that contain Chinese characters.
                quote.reset(str);
                while (quote.find()) {
                    System.out.println(reader.getLineNumber() + ": " + quote.group());
                }
            }
        } catch (IOException e) {
            System.err.println("Cannot read " + filename + ": " + e);
        } finally {
            // reader is still null when the file could not be opened.
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException e) {
                    System.err.println("Cannot close " + filename + ": " + e);
                }
            }
        }
    }
}
测试用的文件：
import java.util.regex.Pattern;
import java.util.regex.Matcher;/**
* Test
* "你好啊"
*/
public class Test {    // main "方法"
    public static void main(String args[]){
        /* 字"符"串 */
        String str = "重阳节";   // String str = "注：重阳节";
        String str = "今天是10月7日\"重阳节\"";
        String str = "重阳节\"你好";  // 重阳节注："你好
         String str = "重阳节 // String str1 = \"重阳节\";";
        String str = "重阳节 // String str2 = \"重阳节\";";  // String str = "注：重阳节 // String str_c = \"注：重阳节\";";
        System.out.print("屏幕时刻：");
        /**
         * 注释 "\"注释\""*/
         */
        String s = second + " " + new String(chs);
        StringBuffer sb = new StringBuffer();
        String y = "中aaa外那 ";
        sb.append("中文");
        String s = "1中ss文+" + "1yin" + "1中文";
        String s = "2中ss文+" + "2yin\\\"中文\"" + "2中文";
        String s = "/*变\"\"态\\\"字\\符\\\"串*/";   /* String s = "注：/*变\"\"态\\\"字\\符\\\"串*/"; */
        throw new Exception("中文");
    }
}

涉及到双引号配对的问题
但是 Java 里的反向引用（如 \2）我好像一直没用成功过（在 Java 字符串字面量里要写成 "\\2"，直接写 "\2" 会被编译器当成八进制转义字符，正则引擎根本看不到反向引用），所以XXX...很麻烦

忽略单行注释那里：
if(comment1.reset(str).matches() || comment2.reset(str).matches()) {
continue;
}
由于去掉了首尾的空格，可以改为：
if(str.startsWith("//") || (str.startsWith("/*") && str.endsWith("*/"))) {
continue;
}
我感觉最麻烦的就是注释的处理和字符串中的各种转义引号什么的。PS：注释和字符串的表达式来自 Mastering Regular Expressions 一书，正则表达式顶级专家写的，呵呵。

那个程序你试试看，还可以再优化一下，比如说：1，可以先找找读出的一行中是否用引号，没有的话直接 continue，下面的“抽取所有的字符串”都可以不用做了，
如果有引号的话，那就很难说了，因为引号可能是在字符串中的，也可能是在注释中的。2，在“抽取所有的字符串”后如果 str 为空串（说明引号是在注释中的，语句中没有引号），那也可以 continue，
下面的“抽取所有的带汉字的字符串”也可以不用做了。

呵呵，这几个不是麻烦了，是极其麻烦了，专家写的东西，为了防止循环匹配，已经优化过了。[^\\\\\"] 表示除了 \ 和 " 之外的其他字符。
Java 代码中的正则表达式使用 \\\\ 表示 \ 字符，因为 \\\\ 在 Java 编译后会变成 \\，
而这个被正则表达式引擎编译后就成 \ 字符了，这里经过了双重转义。使用 \" 表示 "，因此
就是 \\\\\" 了。\\\\.[^\\\\\"]* 表示匹配 \ 加任意字符后面跟着非 \ 和 " 字符，比如：\\abc 或者 \"abc 或者 \t 什么的。这样做的目的是为了处理：
"hello, \"qiandongbo\""而且 "hello, qiandongbo\\" 这种形式最后的 \" 不至于会被认为是转义的引号。呵呵，复杂吧，弄这个脑子得高度清醒，否则一下子就糊涂掉了 ^_^

谢谢火龙大哥了
String s = "2中ss文+" + "2yin\\\"中文\"" + "2中文";
这类的好像还没有解决,昨天想了一下没有想到很好的方案

其实,这个问题可以看作是4对互不相关的括号
只要按照以下规则就可以抽取出来
A:"和"
B:"和\n
C://和\n
D:/*和*/
另外,这4对括号是互斥的
\\和\"直接成对吞入
只要把A和B括号里的汉字抽取出来就OK了

不会啊，这个能提取出"2中ss文+"
"2yin\\\"中文\""
"2中文"啊？而且你在 27 楼的示例中也是给出的。

//测试
// Demo: walks through the content of the file test.java character by character and prints
// the contents of every double-quoted string literal found outside line comments and
// block comments.
// NOTE(review): the file name test.java matches this class, so the program presumably scans
// its own source; if so, every literal and comment below doubles as test input - confirm
// before editing any of them.
public class test{
public static void main(String args[]) throws Exception{
// Small regex experiment, independent of the scanner below.
// NOTE(review): inside a Java string literal, backslash-1 is an octal escape (the character
// U+0001), so the regex engine never sees a back-reference here; a back-reference needs a
// doubled backslash in the literal.
java.util.regex.Pattern p = java.util.regex.Pattern.compile("^4\1");
java.util.regex.Matcher m = p.matcher("12454");
if(m.find())
System.out.println(m.group());
//String sysDate = new java.text.SimpleDateFormat("yyyyMMdd_HH_mm_ss").format(Calendar.getInstance().getTime());
//System.out.println(sysDate);
//for(int i=0;i<128;i++)
// System.out.println(i+","+((char)i));
// Read test.java line by line into y, re-appending a newline after each line.
// No charset is passed, so the platform default charset is used for decoding.
java.io.BufferedReader br = new java.io.BufferedReader(new java.io.InputStreamReader(new java.io.FileInputStream("test.java")));
String x = "";
String y = "";
while((x=br.readLine())!=null)
y += x+"\n";
br.close();

// Hard-coded sample input; it is replaced by the file content on the next line.
String s = "2\"13\\\"//23\\\"857\"6\n2\"13\"/*23\"857\"6";
s = y;
// tmp    - two-character window starting at c[i]
// tmp_2  - text accumulated since the last opening or closing delimiter
// result - extracted string contents, one per line
// b1     - true while in plain code (outside any string or comment)
// b2     - true while inside a double-quoted string
String tmp = "",result = "",tmp_2="";
char[] c = s.toCharArray();
boolean b1 = true,b2 = false;
// Opening delimiter -> delimiters that can close it.
// NOTE(review): Map and HashMap are used unqualified but no import is visible in this snippet.
Map<String,String[]> map = new HashMap<String,String[]>();
map.put("\"",new String[]{"\"","\n"});
map.put("//",new String[]{"\n"});
map.put("/*",new String[]{"*/"});
// NOTE(review): c was already copied from s above, so this appended newline never reaches
// the loop; it only shows up when s is printed at the end.
s += "\n";
// Closers of the construct currently open, or null when none is open.
String clouser[] = null;
// Stops one short of the end because each step looks at the pair c[i], c[i+1].
for(int i=0;i<c.length-1;i++){

// Earlier counter-based attempt, left disabled:
/*
if(x2==0&&x1==0&&c[i]=='/')
x1++;
else if(x2==0&&x1==1&&c[i]=='/')
x1++;
else if(x2==0&&x1==1&&c[i]=='*')
x1+=2;
else if(x1<2&&x2==0&&c[i]=='"')
x2++;
else if(x2==1&&c[i]=='"')
x2--;
else if(c[i]=='\n')
x2=x1=0;*/
tmp = ""+c[i]+c[i+1];
// An escaped quote or an escaped backslash is swallowed as a pair, in every state.
if(tmp.equals("\\\"")){tmp_2+=tmp;i++;
continue;}else if(tmp.equals("\\\\")){tmp_2+=tmp;i++;
continue;}
// Plain code: look for an opening delimiter, first as a single character (the quote),
// then as the two-character window (the comment openers).
if(b1){
if(map.containsKey(""+c[i])){
b1 = false;
b2 = true;
clouser = (String[])map.get(""+c[i]);
tmp_2 = "";
continue;
}else if(map.containsKey(tmp)){
b1 = b2 = false;
clouser = (String[])map.get(tmp);
tmp_2 = "";
i++;
continue;
}
}else{
// Inside a string or comment: look for one of its closing delimiters.
//System.out.println("========="+clouser+","+c[i]+(clouser.charAt(0)==c[i]));
if(clouser!=null){

for(int j=0;j<clouser.length;j++){
//System.out.println(java.util.Arrays.asList(clouser));
// NOTE(review): the first test compares only the first character of the closer, so inside a
// block comment a lone asterisk already counts as the end - confirm whether that is intended.
if(clouser[j].charAt(0)==c[i]||clouser[j].equals(tmp)){
// Closed by a newline: whatever was collected is dropped.
if(c[i]=='\n')
tmp_2 = "";
clouser = null;
// Only text collected inside a string literal is added to the result.
if(b2)
result += tmp_2+"\n";
tmp_2 = "";
b1=true;
b2=false;
break;
}}}
}
tmp_2+=c[i];
//System.out.println(b2+","+result+","+tmp);
}
// Print the scanned text followed by the extracted string contents.
System.out.println(s);
System.out.println(result);
// The declarations below mix string literals with line comments and block comments;
// presumably they are fixtures for the scanner above.
String s22 = "2中ss文+" + "2yin\\\"中文\"" + "2中文"; //String s = "2中ss文+" + "2yin\\\"中文\"" + "2中文";
String s23 = "2中ss文+" + "2yin\\\"中文\"" + "2中文"; /*String s = "2中ss文+" + "2yin\\\"中文\"" + "2中文";
String s = "2中ss文+" + "2yin\\\"中文\"" + "2中文";
String s = "2中ss文+" + "2yin\\\"中文\"" + "2中文";*/String s25 = "2中ss文x+" + "2yin\\\"中文\"" + "2中文";
String s24 = "2中ss文+" + "2yin\\\"中文\"" + "2中/*文"+"2中ss文+" + "2yin\\\"中文\"" + "2中文";
//new Sudoku(3,new int[][]{{0,3,5},{0,6,4},{1,8,1},{2,0,7},{2,3,2},{3,3,9},{3,7,5},{4,2,6},{5,1,4},{5,2,1},{5,6,3},{6,4,4},{6,6,8},{6,8,6},{7,0,9},{7,8,5},{8,5,1}});
//{13,14,15},{3,4,8},{6,4,7},{9,3,12},{8,3,4},{10,8,14},{12,5,13},{10,4,10},{0,1,7},{1,2,1},{1,8,4}}
}
}写了一个,暂时还没发现有什么问题...^4\1
12454
test.java
\n
2\"13\\\"//23\\\"857\"6\n2\"13\"/*23\"857\"6\"
\"
\n
//
\n
/*
*/
\n\\\"
\\\\\n2中ss文+
2yin\\\"中文\"
2中文
2中ss文+
2yin\\\"中文\"
2中文
2中ss文x+
2yin\\\"中文\"
2中文
2中ss文+
2yin\\\"中文\"
2中/*文
2中ss文+
2yin\\\"中文\"
2中文
F:\java>

解决方案 »