//要是字符串里只有一个“有效期”,那么下面方法可行。 int index = str.indexOf("有效期"); int start = str.lastIndexOf("<br", index); int end = str.indexOf("<br", index); System.out.println(str.substring(start,end+6));
//最终是这么整的: Matcher m = Pattern.compile("(?is)((.(?!(<br.*?/?>|</?div.*?>)))*?"+keywords+".*?)(<br.*?/?>|</?div.*?>)").matcher(description);里面有的是用<br>有的是用<div>来换行的,<br>和<div>里面有的还带有style。 keywords可以用多个,比如: String businessHour = getLineFromDescription(description,"(营业时间|用餐时间|使用时间)");
// Attempt 1: match a span that begins with "<br /></strong><span>", contains the
// keyword "有效期", and ends at the next "<br />" (lazy quantifiers keep it short).
Matcher m = Pattern.compile("<br /></strong><span>.*?有效期.*?<br />").matcher(str);
// Attempt 2: match every run of text that sits between two "<br />" tags.
// (?is) = case-insensitive + dot matches line terminators; the look-behind and
// look-ahead keep the "<br />" tags themselves out of the matched text.
// NOTE(review): this redeclares m, so the two declarations cannot coexist in one
// scope — presumably they are alternatives; confirm which one is intended.
Matcher m = Pattern.compile("(?is)(?<=<br />).*?(?=<br />)").matcher(str);
while(m.find()){
// System.out.println(m.group());
// Only print the segments that mention the keyword.
if(m.group().contains("有效期")){
System.out.println(m.group());
}
}
}
<span>
尊敬的XXOO
<br />
你好:
</span>
有效期
<p>
截至时间为:<br />
3月底
</p>
截取完毕后变成:
<br /> 你好:</span>有效期<p> 截至时间为:<br />
还是说不用考虑这么多,反正就是以<br />为界,不管三七二十一地截取就是。
// Position of the first occurrence of the keyword in str (-1 when absent).
int index = str.indexOf("有效期");
// Nearest "<br" that starts at or before the keyword.
int start = str.lastIndexOf("<br", index);
// First "<br" that starts at or after the keyword.
int end = str.indexOf("<br", index);
// Print from the opening "<br" through the closing tag; the +6 is the length of "<br />".
// NOTE(review): when the keyword or a preceding "<br" is missing, start is -1 and
// substring throws; the +6 also assumes the closing tag is written exactly as "<br />".
System.out.println(str.substring(start,end+6));
//最终是这么整的:
Matcher m = Pattern.compile("(?is)((.(?!(<br.*?/?>|</?div.*?>)))*?"+keywords+".*?)(<br.*?/?>|</?div.*?>)").matcher(description);里面有的是用<br>有的是用<div>来换行的,<br>和<div>里面有的还带有style。
keywords可以用多个,比如:
String businessHour = getLineFromDescription(description,"(营业时间|用餐时间|使用时间)");
<br.*?/?>改成 <br\\s*/?\\s*>好一些吧,否则如 <brabcd/>标签也被抽选了,div同理
开始测的时候,经常把
“山城的冬日让人渴望呵护,总想寻找一个养生圣地,休养生息”这一行匹配出来。
楼主威武。
//正则的效率太低,处理10000条数据差不多要30多秒,生产环境没法用!
//换用下面的实现方式,10000条数据大概是3秒
//大家帮忙看看还有更好的实现方式没?
/**
 * Pulls the "validity period" line and the "business hours" line out of an HTML
 * description and returns them concatenated (validity first).
 *
 * The text is cut into segments at every div open/close tag and every br tag.
 * The first segment containing "有效期" is the validity line; the first segment
 * containing any of the business-hour keywords is the business-hours line.
 * Remaining tags are stripped from both and the results are trimmed. When both
 * lines turn out to be the same text it is returned only once.
 *
 * @param specialMessage HTML fragment to scan; may be null
 * @return the extracted text, or "" when the input is null/empty or nothing matched
 */
public static String getSpecialMessage(String specialMessage){
    if(specialMessage == null || specialMessage.isEmpty()){
        return "";
    }
    // Any one of these marks a segment as the business-hours line.
    final String[] businessHourKeywords = {"营业时间","使用时间","用餐时间","消费时间","消费时段","接待时间"};
    String deadlineSegment = "";
    String businessHourSegment = "";
    // Split on br or div tags.
    for(String segment : specialMessage.split("(?is)</?div.*?>|<br.*?>")){
        if(deadlineSegment.isEmpty() && segment.contains("有效期")){
            deadlineSegment = segment;
        }
        if(businessHourSegment.isEmpty()){
            for(String keyword : businessHourKeywords){
                if(segment.contains(keyword)){
                    businessHourSegment = segment;
                    break;
                }
            }
        }
        // Both lines found: no need to look at the remaining segments.
        if(!deadlineSegment.isEmpty() && !businessHourSegment.isEmpty()){
            break;
        }
    }
    // Strip every remaining tag.
    final String deadline = deadlineSegment.replaceAll("<.*?>", "").trim();
    final String businessHour = businessHourSegment.replaceAll("<.*?>", "").trim();
    // Validity goes first; business hours are appended only when they add something new.
    if(businessHour.isEmpty() || businessHour.equals(deadline)){
        return deadline;
    }
    return deadline + businessHour;
}
//按这种实现方式大概需要600ms
/**
 * Extracts the validity line and the business-hours line from an HTML description
 * by delegating to getLineByKeywords, and returns them concatenated (validity first).
 * When both lookups yield the same text it is returned only once.
 *
 * @param specialMessage HTML fragment to scan; may be null
 * @return the extracted text, or "" for null/empty input
 */
public static String getSpecialMessage(String specialMessage){
    if(specialMessage == null || specialMessage.isEmpty()){
        return "";
    }
    final String validity = getLineByKeywords(specialMessage,"有效期");
    final String openingHours = getLineByKeywords(specialMessage,"营业时间,使用时间,用餐时间,消费时间,消费时段,接待时间");
    return validity.equals(openingHours) ? validity : validity + openingHours;
}
/**
 * Tries each keyword in turn and returns the first non-empty line that
 * getLineByKeyword produces for it.
 *
 * @param specialMessage HTML fragment to scan; may be null
 * @param keywords comma-separated keywords, tried from left to right
 * @return the first non-empty line found, or "" when the input is null/empty
 *         or every keyword yields an empty line
 */
public static String getLineByKeywords(String specialMessage,String keywords){
    if(specialMessage == null || specialMessage.isEmpty()){
        return "";
    }
    for(String candidate : keywords.split(",")){
        final String line = getLineByKeyword(specialMessage,candidate);
        if(!line.isEmpty()){
            return line;
        }
    }
    return "";
}
/**
 * Returns the "line" of an HTML fragment that contains the first occurrence of
 * a keyword, with all tags stripped and surrounding whitespace trimmed.
 *
 * A line is delimited by the nearest "<br", "<div" or "</div" before the keyword
 * (or the start of the text) and the nearest one after it (or the end of the text).
 *
 * @param specialMessage HTML fragment to scan; null is treated as empty
 * @param keyword text to look for; null or empty never matches
 * @return the tag-free line containing the keyword, or "" when the keyword does
 *         not occur in the message
 */
public static String getLineByKeyword(String specialMessage,String keyword){
    if(specialMessage == null || keyword == null || keyword.isEmpty()){
        return "";
    }
    int index = specialMessage.indexOf(keyword);
    // Without this guard a missing keyword (index == -1) makes the searches below
    // run from the start of the text and return its first line instead of "".
    if(index < 0){
        return "";
    }
    final String[] lineBreakTags = {"<br", "<div", "</div"};
    // lastIndexOf/indexOf return -1 when a tag is absent: fall back to the text
    // boundaries in that case.
    int start = 0;
    int end = specialMessage.length();
    for(String tag : lineBreakTags){
        start = Math.max(start, specialMessage.lastIndexOf(tag, index));
        int next = specialMessage.indexOf(tag, index);
        if(next >= 0){
            end = Math.min(end, next);
        }
    }
    // The slice still starts with its opening delimiter tag (and may contain inline
    // tags such as span); remove all of them.
    String line = specialMessage.substring(start, end);
    return line.replaceAll("<.*?>", "").trim();
}
/**
 * Returns the largest of three ints.
 */
public static int max(int a,int b,int c){
    return Math.max(Math.max(a, b), c);
}
/**
 * Returns the smallest of three ints.
 */
public static int min(int a,int b,int c){
    return Math.min(Math.min(a, b), c);
}
// ACMatcher: multi-pattern matcher built on a character trie whose nodes live in
// one flat array (table). Patterns are registered as Start, End or Key; match()
// scans the input once and reports to a LineHandler each time an End pattern
// completes after a Key pattern has been seen.
// Default_Table_Size is the initial capacity of the node table.
package net.csdn.bbs.goldenfish1919;public class ACMatcher { private static final int Default_Table_Size = 64;
// Default_Table_Increasement: number of slots added each time the node table is
// full (see the Node constructor).
// LineHandler: callback invoked by match(). As called there, start is one past
// the position recorded for the previous Start/End hit, keyIndex is the index of
// the last character of the first Key pattern seen since then, and end is the
// index of the first character of the End pattern that just completed.
private static final int Default_Table_Increasement = 32; public interface LineHandler{
void onMatch(String data,String pattern,int start,int keyIndex,int end);
}
// Role of a registered pattern: line start delimiter, line end delimiter, or keyword.
public static enum PatternType {
Start, End, Key
// Node: one trie node. It is an inner (non-static) class because it reads and
// writes the enclosing matcher's table and tableSize.
}; class Node {
// Position of this node inside table.
int index;
// Character on the edge leading into this node.
char ch;
// Table indices of the child nodes; null until the first child is added.
int[] nexts;
// Table index of the node match() jumps to when no child fits the next character.
int failure;
// Patterns recorded on this node; null when no pattern ends here.
String[] patterns;
// Type of the pattern that ends on this node (set by ACMatcher.addPattern).
// The constructor below registers the new node in table, growing the table by
// Default_Table_Increasement slots when it is full, and defaults failure to 0 (the root).
PatternType type; public Node(char ch) {
this.ch = ch;
if (table.length < tableSize + 1) {
Node[] nodes = new Node[table.length
+ Default_Table_Increasement];
System.arraycopy(table, 0, nodes, 0, tableSize);
table = nodes;
}
this.index = tableSize;
table[tableSize++] = this;
failure = 0;
// containsChildCharacter (next): true when some child node carries character c
// (linear scan over nexts).
} public boolean containsChildCharacter(char c) {
if (nexts != null) {
for (int index : nexts) {
if (table[index].ch == c) {
return true;
}
}
}
return false;
// getChild (next): the child node carrying character c, or null when there is none.
} public Node getChild(char c) {
if (nexts != null) {
for (int index : nexts) {
if (table[index].ch == c) {
return table[index];
}
}
}
return null;
// addChild (next): appends node's table index to nexts by copying into an array
// one element longer.
} public void addChild(Node node) {
if (nexts == null) {
nexts = new int[] { node.index };
} else {
int[] childs = new int[nexts.length + 1];
System.arraycopy(nexts, 0, childs, 0, nexts.length);
childs[nexts.length] = node.index;
nexts = childs;
}
// addPattern (next): appends pattern to this node's patterns array, again by copying.
} public void addPattern(String pattern) {
if (patterns == null) {
patterns = new String[] { pattern };
} else {
String[] newPatterns = new String[patterns.length + 1];
System.arraycopy(patterns, 0, newPatterns, 0, patterns.length);
newPatterns[patterns.length] = pattern;
patterns = newPatterns;
}
}
// table: flat storage for all trie nodes; table[0] is the root.
} private Node table[] = new Node[Default_Table_Size];
// tableSize: number of slots of table currently in use.
// Constructor (next): creates the root and registers all start, end and key patterns,
// in that order.
private int tableSize = 0; public ACMatcher(String[] starts, String[] ends, String[] keys) {
// The Node constructor already stores the root at table[0] and bumps tableSize to 1;
// the two statements below restate that result.
table[0] = new Node((char) 0);
tableSize = 1;
for (String pattern : starts) {
addPattern(pattern, PatternType.Start);
}
for (String pattern : ends) {
addPattern(pattern, PatternType.End);
}
for (String pattern : keys) {
addPattern(pattern, PatternType.Key);
}
}
// Convenience constructor: "<div>" starts a line, "</div>" and "<br />" end one.
public ACMatcher(String[] keys){
this(new String[]{"<div>"},new String[]{"</div>","<br />"},keys);
// addPattern (next): inserts pattern into the trie, creating nodes as needed, and tags
// the final node with type.
} private void addPattern(String pattern, PatternType type) {
char[] chs = pattern.toCharArray();
Node current = table[0];
for (char ch : chs) {
if (current.containsChildCharacter(ch)) {
current = current.getChild(ch);
} else {
Node node = new Node(ch);
current.addChild(node);
current = node;
/** Build the fallback (mismatch) link for the new node. */
// Picks the first earlier non-root node carrying the same character; the bound
// tableSize - 1 excludes the node just created.
// NOTE(review): this is not the longest-proper-suffix link of textbook
// Aho-Corasick — confirm whether the simplification is intentional.
for (int k = 1; k < tableSize - 1; k++) {
if (table[k].ch == ch) {
current.failure = k;
break;
}
}
}
}
current.type = type;
current.addPattern(pattern);
/** Record already-registered patterns that are suffixes of this pattern. */
// NOTE(review): the scan skips the most recently created node, and when pattern ended
// on an existing node that node itself is scanned, so pattern can be added twice —
// verify this is acceptable.
for(int k=1;k<tableSize-1;k++){
if(table[k].patterns!=null){
for(String suffix : table[k].patterns){
if(pattern.endsWith(suffix)){
current.addPattern(suffix);
}
}
}
}
// match (next): scans data once; for every End pattern that completes after a Key
// pattern was seen, calls handler.onMatch once per pattern recorded on that node.
} public void match(String data,LineHandler handler){
Node node = table[0];
char[] chs = data.toCharArray();
// start: index of the last character of the most recent Start/End hit.
// keyIndex: index of the last character of the first Key hit since then;
// 0 doubles as "no key seen yet".
int start = 0,keyIndex = 0;
for(int i=0;i<chs.length;i++){
if(node.containsChildCharacter(chs[i])){
node = node.getChild(chs[i]);
if(node.patterns!=null){
switch(node.type){
case Start:start=i;break;
case End:
if(keyIndex>0)
for(String pattern : node.patterns){
handler.onMatch(data, pattern,start+1,keyIndex,i-pattern.length()+1);
}
// Reset for the next line whether or not a key was reported.
start=i;
keyIndex=0;break;
case Key:
// Keep only the first key hit within the current line.
keyIndex = keyIndex==0?i:keyIndex;break;
}
}
}else{
// No child for this character: jump to the fallback node. The loop then moves on
// to the next character, so chs[i] is not retried against the fallback node.
// NOTE(review): confirm that dropping the character here is intended.
node = table[node.failure];
}
}
}
}