搜索引擎中长串匹配及聚类算法:后缀树算法(Java版)

前两天用C++写了一个后缀树算法,今天把它直译成java了,经过一晚上的努力终于调试通了,现在把构造后缀树的核心代码贴出来,大家可以交流一下,比如查找下一个更短的后缀时有木有更好的方法啊.
然后在测试的时候发现了一个奇怪的问题,平常都觉得C++会比Java快,然而这次测试却截然相反,都是单线程构造同样的46K全英文文本,C++用了2分零3秒,而Java仅仅用了4秒钟,这个问题我感到挺诧异的,因为我都是就想C++都要那么久那Java是不是更得等半天呢,难道Java在处理字符串上比C++有优势吗?我在C++里面用的是容器string.这个其实很值得研究一下.
题外话:我写了一个c++的程序来将c++的源代码转换为java源代码,比手工修改起来还是方便多了,如果哪位需要的话可以找我要源码,也可以探讨一下这方面经验.
此外,由于下面的代码是直译过来的,可能不太符合java的风格,看着别扭的地方还请勿拍砖. void updateTree(int start) {
Node pTmpNode;
Edge pEdge, pTmpEdge;
int i = 0, edgeNum;
int matchePos = 0;// 要匹配的位置
boolean bMatched = false;
m_pOldNode = null;

if(debug)
System.out.println(">>>>>>>>>>>>>>>> '" + subData(start, 1)+"' >>>>>>>>>>>>>>>>>>>");
// 搜索节点是否包含待加入字符
while (!bMatched) {
// 活动节点:隐含节点为激活节点,或者本身为激活节点
// 注意活动节点是非叶节点
// 1.匹配整个后缀,即结束节点
// 如果是隐含节点
if (m_activeNode.m_bImplicit) {
matchePos = m_activeNode.getImplicitPosition() + 1;
if (isDataEqual(start, matchePos)) {
bMatched = true;// 退出,因为更短的后缀也肯定是匹配的
}
}
// 显示节点
else {
edgeNum = m_activeNode.m_pNode.getChildNum();
for (i = 0; i < edgeNum; i++) {
matchePos = m_activeNode.m_pNode.getChildEdge(i).getStartPos();
if (isDataEqual(start, matchePos)) {
bMatched = true;
break;
}
}
}
if (bMatched) {
// 更新激活节点
m_activeNode.increaseImplicitNodeOffset(i);// 可能显隐互变?
}
// 最后字符不匹配
else {
// 2.隐式节点,拆分节点,并添加叶节点
if (m_activeNode.m_bImplicit) {
// 拆分:
pEdge = m_activeNode.getEdge();// 上面边
// 添加1个节点
pTmpNode = new Node(pEdge);// 中间节点
// 拆分为2条边
pTmpEdge = new Edge(matchePos, pEdge.getVirtualEndPos(),
pTmpNode, this);// 下面边 pEdge.updateEndPosition(matchePos);// 若是叶子节点时,从此摆脱叶子节点
pTmpEdge.setNextNode(pEdge.getNextNode());
pEdge.getNextNode().setFrontEdge(pTmpEdge);// 分割时忘了把子节点的父亲边改为下面边!!!
pEdge.setNextNode(pTmpNode);
pTmpNode.addEdge(pTmpEdge); // 隐式节点变显示节点,并作为激活节点(有点牵强?)
m_activeNode.setExplicitNode(pTmpNode);
// 后缀链接
if (m_pOldNode != null)
m_pOldNode.setNextSuffixPointer(m_activeNode.m_pNode);
m_pOldNode = m_activeNode.m_pNode;

if(debug)
System.out.println(String.format("\nsplit:\n %d,%d,%d [%s]\n %d,%d [%s--%s] \n",
pEdge.getStartPos(), pEdge.getEndPos(),
pTmpEdge.getEndPos(),
subData(pEdge.getStartPos(),
pTmpEdge.getEndPos()- pEdge.getStartPos()), start, start + 1,
subData(pEdge.getStartPos(),
pEdge.getEndPos()- pEdge.getStartPos()), subData(start, 1)));
}
// 3.显示节点,仅在显示节点下新增叶子节点
// 显示节点下新增叶子节点(含上面隐式变显示的节点)
pTmpEdge = new Edge(start, 0, m_activeNode.m_pNode, this);// end为0时表示叶子节点
m_activeNode.m_pNode.addEdge(pTmpEdge); int endPos = 0;
if (m_activeNode.m_bImplicit)
endPos = m_activeNode.getEdge().getEndPos();
else if (m_activeNode.m_pNode != m_pRoot)
endPos = m_activeNode.m_pNode.getFrontEdge().getEndPos();
if(debug)
System.out.println(String.format(
"\nnew:\n %d-(%d,%d '%s'--'%s')\n\n", endPos, start,
start + 1, subData(endPos - 1, 1), subData(pTmpEdge
.getStartPos(), pTmpEdge.charSize()))); // 匹配更短的后缀
// 判断后缀指针是否存在
if (m_activeNode.m_pNode.hasNextSuffixPointer()) {
m_activeNode.setExplicitNode(m_activeNode.m_pNode
.getNextSuffix());
}
// 若不存在则手动搜索下一个更短后缀
else {
// 4.当前后缀已经为根节点(非隐含),退出
if (m_activeNode.m_pNode == m_pRoot)// !!!
bMatched = true;
else {
CharPosition pos =new CharPosition();
pTmpNode = m_activeNode.m_pNode;
// 要查找的更短后缀
String str = "";
// 无论如何此时m_activeNode为显示节点
/*/ (开始就是根节点下的)隐含节点
if (m_activeNode.m_bImplicit)// pTmpEdge==null
{
pTmpEdge = m_activeNode.getEdge();
str = subData(pTmpEdge.getStartPos(),
m_activeNode.m_nImplicitNodeOffsetInEdge);
}// */
while ((pTmpEdge = pTmpNode.getFrontEdge()) != null) {
str=subData(pTmpEdge.getStartPos(),pTmpEdge.charSize())+str;
pTmpNode = pTmpEdge.getFrontNode();
if (pTmpNode == m_pRoot)// 找到根节点下面的边
{
break;
}
}
str=str.substring(1);//str.erase(0, 1);
// 不连续时会出问题!!!
// str=this.subData(pTmpEdge.getStartPos()+1,matchePos-(pTmpEdge.getStartPos()+1));

if (str.length() == 0)// 表示没有更短的后缀了,将根节点作为激活节点
{
m_activeNode.setExplicitNode(m_pRoot);
}
else if (search(str, pos))// 必定能找到,查找到结果后,隐式节点返回值pTmpNode为父节点指针
{
m_activeNode.m_pNode = pos.pNode;
m_activeNode.m_nImplicitNodeEdge = pos.edgeIndex;
m_activeNode.m_bImplicit = pos.bImplicit;
m_activeNode.m_nImplicitNodeOffsetInEdge = pos.charIndex; } else {
if (str.length() > 20)
str = str.substring(0, 10) + "..."
+ str.substring(str.length() - 10);
System.out.println(String.format(
"-------------- error: when search '%s' -----------",
str)); m_activeNode.setExplicitNode(m_pRoot);
}
}
}
}// end of else
}// end of while
}

解决方案 »

免费领取超大流量手机卡，每月29元包185G流量+100分钟通话, 中国电信官方发货

楼主辛苦了，速度测试工具和代码转换工具我都要 [email protected]
回楼上两位,速度测试的话,只要调用api获取起至时间就ok了,比如 java可以用new Date()或
System.currentTimeMillis(),c的话调用time()就好了,转换程序的话我给你们发过去吧(cpp2java,c++写的,里面有编译好的可执行文件).
楼主，可否给我发一份C++代码？另外我对Ukkonen的算法有点不懂的地方，想请教你一下，不知道可否？[email protected]
这样用： static public KeywordMatcher showExample(KeywordMatcher m){
if (m == null){
Map<String, Object> keywords = new HashMap<String, Object>();

keywords.clear();
keywords.put("中国", "中国");
keywords.put("中国人", "中国人");
keywords.put("中华人民共和国", "中华人民共和国");
keywords.put("毛泽东", "毛泽东");
keywords.put("江泽民", "江泽民");
keywords.put("天安门", "天安门");
keywords.put("年", "年");
keywords.put("北京", "北京");
keywords.put("上海", "上海");
keywords.put("，", "逗号"); System.out.println("*** 关键词表 *******");
for (String w: keywords.keySet()){
System.out.format("\t %-15s ---> %s\n", w, keywords.get(w));
}
System.out.println(); m = new KeywordMatcher(keywords);

}
String s ="1949年10月1日，在北京天安门上，毛泽东庄严宣布，\n"
+ "中华人民共和国成立了，从此，中国人民站起来了。这是全中国人民的节日，\n"
+ "北京、上海等地的人民欢呼雀跃。";
System.out.println("*** 文本 *******");
System.out.println(s);
System.out.println();

Map<Object, MutableInt> result = m.match(s);

System.out.println("*** 结果 *******");
for (Object o: result.keySet()){
System.out.format("\t %-15s ===> %d\n", o, result.get(o).intValue());
}
System.out.println();
return m;
}输出结果：*** 关键词表 *******
上海              ---> 上海
北京              ---> 北京
天安门             ---> 天安门
中国人             ---> 中国人
江泽民             ---> 江泽民
中华人民共和国         ---> 中华人民共和国
年               ---> 年
，               ---> 逗号
毛泽东             ---> 毛泽东
中国              ---> 中国*** 文本 *******
1949年10月1日，在北京天安门上，毛泽东庄严宣布，
中华人民共和国成立了，从此，中国人民站起来了。这是全中国人民的节日，
北京、上海等地的人民欢呼雀跃。*** 结果 *******
上海              ===> 1
北京              ===> 2
中国人             ===> 2
天安门             ===> 1
中华人民共和国         ===> 1
年               ===> 1
毛泽东             ===> 1
逗号              ===> 6
以及，如果是对URL进行匹配：
static public UrlStartWithMatcher showExample(UrlStartWithMatcher m){
if (m == null){
Map<String, Object> heads = new HashMap<String, Object>(); ////////////  URL匹配  ///////////////
heads.clear();
for (String s: new String[] {
"news.sina.com.cn",
"*.sina.com.cn",
"*.sina.com.cn/z",
"*.sina.com.cn/images",
"*.baidu.cn",
"*.baidu.com",
"*.baidu.com/se",
"*.baidu.com/set",
"www.baidu.com/search/",
"www.baidu.com/",
"*.sina.com.cn/news/daily/a.jpg"
}){
heads.put(s, s);
}
System.out.println("   ------- 匹配对应表 -------");
for (String pattern: heads.keySet()){
System.out.format("   %-35s ---> %s\n", pattern, heads.get(pattern));
}
System.out.println();

m = new UrlStartWithMatcher(heads);
}
System.out.println("   ------- 匹配结果 -------");
for (String s: new String[] {
"http://new-s.sina.com.cn",
"http://news.sina.com.cn",
"https://news_sina3com4cn",
"https://news.sina.com.cn/z/2010chunyun/index.shtml",
"http://h3.news.sina.com.cn/z/2010chunyun/index.shtml",
"ent.sina.com.cn/entertainment/x/3/a.html",
"news.sina.com.cn",
"news_sina3com4cn",
"news.sina.com.cn/z/2010chunyun/index.shtml",
"ent.sina.com.cn/entertainment/x/3/a.html",
"hollywood.sina.com.cn/entertainment/news/today.html",
"www.baidu.com/search?key=3&a=b",
"mp3.baidu.com/download/song=1",
"video.baidu.com/screen/video=2",
"www.baidu.com/news/a.html",
"www.baidu.com/setup/",
"www.baidu.com/search/abcd.jpg",
"www.baidu.com/search/mp3/test.jpg",
"www.baidu.com/search/video/flash/new.jpg",
"www.baidu.com/news/daily/headline.jpg",
"www.baidu.com/news/daily/common/headline.jpg"
}){
Object o = m.match(s);
System.out.format("   %-54s ===> %s\n", s, (o!=null ? o.toString() : "null"));
}
return m;
}   ------- 匹配对应表 -------
   *.baidu.com/set                     ---> *.baidu.com/set
   *.sina.com.cn                       ---> *.sina.com.cn
   *.baidu.com                         ---> *.baidu.com
   *.sina.com.cn/images                ---> *.sina.com.cn/images
   news.sina.com.cn                    ---> news.sina.com.cn
   www.baidu.com/search/               ---> www.baidu.com/search/
   *.baidu.cn                          ---> *.baidu.cn
   *.sina.com.cn/news/daily/a.jpg      ---> *.sina.com.cn/news/daily/a.jpg
   *.sina.com.cn/z                     ---> *.sina.com.cn/z
   *.baidu.com/se                      ---> *.baidu.com/se
   www.baidu.com/                      ---> www.baidu.com/   ------- 匹配结果 -------
   http://new-s.sina.com.cn                               ===> *.sina.com.cn
   http://news.sina.com.cn                                ===> news.sina.com.cn
   https://news_sina3com4cn                               ===> null
   https://news.sina.com.cn/z/2010chunyun/index.shtml     ===> *.sina.com.cn/z
   http://h3.news.sina.com.cn/z/2010chunyun/index.shtml   ===> *.sina.com.cn/z
   ent.sina.com.cn/entertainment/x/3/a.html               ===> *.sina.com.cn
   news.sina.com.cn                                       ===> news.sina.com.cn
   news_sina3com4cn                                       ===> null
   news.sina.com.cn/z/2010chunyun/index.shtml             ===> *.sina.com.cn/z
   ent.sina.com.cn/entertainment/x/3/a.html               ===> *.sina.com.cn
   hollywood.sina.com.cn/entertainment/news/today.html    ===> *.sina.com.cn
   www.baidu.com/search?key=3&a=b                         ===> *.baidu.com/se
   mp3.baidu.com/download/song=1                          ===> *.baidu.com
   video.baidu.com/screen/video=2                         ===> *.baidu.com
   www.baidu.com/news/a.html                              ===> www.baidu.com/
   www.baidu.com/setup/                                   ===> *.baidu.com/set
   www.baidu.com/search/abcd.jpg                          ===> www.baidu.com/search/
   www.baidu.com/search/mp3/test.jpg                      ===> www.baidu.com/search/
   www.baidu.com/search/video/flash/new.jpg               ===> www.baidu.com/search/
   www.baidu.com/news/daily/headline.jpg                  ===> www.baidu.com/
   www.baidu.com/news/daily/common/headline.jpg           ===> www.baidu.com/