下面是其中一个xml文件,路径为:F:\corpus samples\sen0001.xml
<?xml version="1.0" encoding="gb2312" standalone="no" ?>
- <utterance total_sample="124907">
<sil end_sample="8731" /> //这个应作为第一个start_sample
- <sentence>
- <prosodic_phrase>
- <prosodic_word>
<syllable ch_char="为" pinyin="wei4" end_sample="14212" />
<sil end_sample="14318" />
<syllable ch_char="临" pinyin="lin2" end_sample="18632" />
<sil end_sample="18703" />
<syllable ch_char="帖" pinyin="tie4" end_sample="25740" />
</prosodic_word>
</prosodic_phrase>
- <prosodic_phrase>
- <prosodic_word>
<sil end_sample="30319" />
<syllable ch_char="他" pinyin="ta1" end_sample="33343" />
<sil end_sample="33414" />
<syllable ch_char="还" pinyin="hai2" end_sample="37391" bad="yes" />
</prosodic_word>
- <prosodic_word>
<sil end_sample="37462" />
<syllable ch_char="远" pinyin="yuan3" end_sample="41617" />
<sil end_sample="41741" />
<syllable ch_char="游" pinyin="you2" end_sample="47168" />
</prosodic_word>
- <prosodic_word>
<sil end_sample="47232" />
<syllable ch_char="西" pinyin="xi1" end_sample="50740" />
<sil end_sample="50775" />
<syllable ch_char="安" pinyin="an1" end_sample="54736" bad="yes" />
</prosodic_word>
- <prosodic_word>
<sil end_sample="55184" />
<syllable ch_char="碑" pinyin="bei1" end_sample="58192" />
<sil end_sample="58239" />
<syllable ch_char="林" pinyin="lin2" end_sample="65393" />
</prosodic_word>
</prosodic_phrase>
- <prosodic_phrase>
- <prosodic_word>
<sil end_sample="69120" />
<syllable ch_char="龙" pinyin="long2" end_sample="73037" />
<sil end_sample="73081" />
<syllable ch_char="门" pinyin="men2" end_sample="77216" />
</prosodic_word>
- <prosodic_word>
<sil end_sample="77291" />
<syllable ch_char="石" pinyin="shi2" end_sample="81843" />
<sil end_sample="82011" />
<syllable ch_char="窟" pinyin="ku1" end_sample="87964" />
</prosodic_word>
</prosodic_phrase>
- <prosodic_phrase>
- <prosodic_word>
<sil end_sample="92561" />
<syllable ch_char="泰" pinyin="tai4" end_sample="97200" />
<sil end_sample="97313" />
<syllable ch_char="山" pinyin="shan1" end_sample="102455" />
</prosodic_word>
- <prosodic_word>
<sil end_sample="102508" />
<syllable ch_char="摩" pinyin="mo2" end_sample="106749" bad="yes" />
<sil end_sample="106820" />
<syllable ch_char="崖" pinyin="ya2" end_sample="110016" />
</prosodic_word>
- <prosodic_word>
<sil end_sample="110159" />
<syllable ch_char="石" pinyin="shi2" end_sample="114153" />
<sil end_sample="114416" />
<syllable ch_char="刻" pinyin="ke4" end_sample="119772" />
</prosodic_word>
</prosodic_phrase>
<sil end_sample="124907" /> //这个不必存入表格
</sentence>
</utterance>
这是我处理的程序:DECLARE @idoc int
DECLARE @doc varchar(8000)--从文件中读出XML内容到临时表
create table #tb (doc varchar(8000))
bulk insert #tb from 'F:\corpus samples\sen0001.xml'--将读取到的数据保存到变量中
set @doc=''
select @doc=@doc+doc from #tb--删除临时表
drop table #tb--获取数据
EXEC sp_xml_preparedocument @idoc OUTPUT, @doc--创建测试的表
create table ta
(ch_char char(4),pinyin char(10),start_sample int,end_sample int)insert into ta(ch_char,pinyin,end_sample)
SELECT * FROM OPENXML (@idoc, '/utterance/sentence/prosodic_phrase/prosodic_word/syllable',2)
WITH (ch_char char(4) '@ch_char',
pinyin char(10) '@pinyin',
end_sample int '@end_sample')insert into ta(start_sample)
SELECT * FROM OPENXML (@idoc, '/utterance/sil',2)
WITH (start_sample int '@end_sample')insert into ta(start_sample)
SELECT * FROM OPENXML (@idoc, '/utterance/sentence/prosodic_phrase/prosodic_word/sil',2)
WITH (start_sample int '@end_sample')
EXEC sp_xml_removedocument @idoc--建立临时表
select * , id=identity(int) into #tt from ta
--得到表中记录数的一半
declare @num int
select @num=count(*) from #tt
set @num=(@num+1)/2
--查询结果
insert into syllables(ch_char,pinyin,start_sample,end_sample)
select ch_char, pinyin,start_sample=isnull(start_sample, (select start_sample from #tt as tt where tt.id=#tt.id+@num)),end_sample
from #tt
where id<=@num--清除
drop table #tt
drop table ta--显示处理结果
--select * from syllables想得到这样的表格:
word pinyin start_sample end_sample file_name
--------------------------------------------
为 wei4 873114212 sen0001.xml
临 lin2 1431818632 sen0001.xml
帖 tie4 1870325740 sen0001.xml
他 ta1 3031933343 sen0001.xml
还 hai2 3341437391 sen0001.xml
远 yuan3 3746241617 sen0001.xml
游 you2 4174147168 sen0001.xml
西 xi1 4723250740 sen0001.xml
安 an1 5077554736 sen0001.xml
碑 bei1 5518458192 sen0001.xml
林 lin2 5823965393 sen0001.xml
龙 long2 6912073037 sen0001.xml
门 men2 7308177216 sen0001.xml
石 shi2 7729181843 sen0001.xml
窟 ku1 8201187964 sen0001.xml
泰 tai4 9256197200 sen0001.xml
山 shan1 97313102455 sen0001.xml
摩 mo2 102508106749 sen0001.xml
崖 ya2 106820110016 sen0001.xml
石 shi2 110159114153 sen0001.xml
刻 ke4 114416119772 sen0001.xml
<?xml version="1.0" encoding="gb2312" standalone="no" ?>
- <utterance total_sample="124907">
<sil end_sample="8731" /> //这个应作为第一个start_sample
- <sentence>
- <prosodic_phrase>
- <prosodic_word>
<syllable ch_char="为" pinyin="wei4" end_sample="14212" />
<sil end_sample="14318" />
<syllable ch_char="临" pinyin="lin2" end_sample="18632" />
<sil end_sample="18703" />
<syllable ch_char="帖" pinyin="tie4" end_sample="25740" />
</prosodic_word>
</prosodic_phrase>
- <prosodic_phrase>
- <prosodic_word>
<sil end_sample="30319" />
<syllable ch_char="他" pinyin="ta1" end_sample="33343" />
<sil end_sample="33414" />
<syllable ch_char="还" pinyin="hai2" end_sample="37391" bad="yes" />
</prosodic_word>
- <prosodic_word>
<sil end_sample="37462" />
<syllable ch_char="远" pinyin="yuan3" end_sample="41617" />
<sil end_sample="41741" />
<syllable ch_char="游" pinyin="you2" end_sample="47168" />
</prosodic_word>
- <prosodic_word>
<sil end_sample="47232" />
<syllable ch_char="西" pinyin="xi1" end_sample="50740" />
<sil end_sample="50775" />
<syllable ch_char="安" pinyin="an1" end_sample="54736" bad="yes" />
</prosodic_word>
- <prosodic_word>
<sil end_sample="55184" />
<syllable ch_char="碑" pinyin="bei1" end_sample="58192" />
<sil end_sample="58239" />
<syllable ch_char="林" pinyin="lin2" end_sample="65393" />
</prosodic_word>
</prosodic_phrase>
- <prosodic_phrase>
- <prosodic_word>
<sil end_sample="69120" />
<syllable ch_char="龙" pinyin="long2" end_sample="73037" />
<sil end_sample="73081" />
<syllable ch_char="门" pinyin="men2" end_sample="77216" />
</prosodic_word>
- <prosodic_word>
<sil end_sample="77291" />
<syllable ch_char="石" pinyin="shi2" end_sample="81843" />
<sil end_sample="82011" />
<syllable ch_char="窟" pinyin="ku1" end_sample="87964" />
</prosodic_word>
</prosodic_phrase>
- <prosodic_phrase>
- <prosodic_word>
<sil end_sample="92561" />
<syllable ch_char="泰" pinyin="tai4" end_sample="97200" />
<sil end_sample="97313" />
<syllable ch_char="山" pinyin="shan1" end_sample="102455" />
</prosodic_word>
- <prosodic_word>
<sil end_sample="102508" />
<syllable ch_char="摩" pinyin="mo2" end_sample="106749" bad="yes" />
<sil end_sample="106820" />
<syllable ch_char="崖" pinyin="ya2" end_sample="110016" />
</prosodic_word>
- <prosodic_word>
<sil end_sample="110159" />
<syllable ch_char="石" pinyin="shi2" end_sample="114153" />
<sil end_sample="114416" />
<syllable ch_char="刻" pinyin="ke4" end_sample="119772" />
</prosodic_word>
</prosodic_phrase>
<sil end_sample="124907" /> //这个不必存入表格
</sentence>
</utterance>
这是我处理的程序:DECLARE @idoc int
DECLARE @doc varchar(8000)--从文件中读出XML内容到临时表
create table #tb (doc varchar(8000))
bulk insert #tb from 'F:\corpus samples\sen0001.xml'--将读取到的数据保存到变量中
set @doc=''
select @doc=@doc+doc from #tb--删除临时表
drop table #tb--获取数据
EXEC sp_xml_preparedocument @idoc OUTPUT, @doc--创建测试的表
create table ta
(ch_char char(4),pinyin char(10),start_sample int,end_sample int)insert into ta(ch_char,pinyin,end_sample)
SELECT * FROM OPENXML (@idoc, '/utterance/sentence/prosodic_phrase/prosodic_word/syllable',2)
WITH (ch_char char(4) '@ch_char',
pinyin char(10) '@pinyin',
end_sample int '@end_sample')insert into ta(start_sample)
SELECT * FROM OPENXML (@idoc, '/utterance/sil',2)
WITH (start_sample int '@end_sample')insert into ta(start_sample)
SELECT * FROM OPENXML (@idoc, '/utterance/sentence/prosodic_phrase/prosodic_word/sil',2)
WITH (start_sample int '@end_sample')
EXEC sp_xml_removedocument @idoc--建立临时表
select * , id=identity(int) into #tt from ta
--得到表中记录数的一半
declare @num int
select @num=count(*) from #tt
set @num=(@num+1)/2
--查询结果
insert into syllables(ch_char,pinyin,start_sample,end_sample)
select ch_char, pinyin,start_sample=isnull(start_sample, (select start_sample from #tt as tt where tt.id=#tt.id+@num)),end_sample
from #tt
where id<=@num--清除
drop table #tt
drop table ta--显示处理结果
--select * from syllables想得到这样的表格:
word pinyin start_sample end_sample file_name
--------------------------------------------
为 wei4 873114212 sen0001.xml
临 lin2 1431818632 sen0001.xml
帖 tie4 1870325740 sen0001.xml
他 ta1 3031933343 sen0001.xml
还 hai2 3341437391 sen0001.xml
远 yuan3 3746241617 sen0001.xml
游 you2 4174147168 sen0001.xml
西 xi1 4723250740 sen0001.xml
安 an1 5077554736 sen0001.xml
碑 bei1 5518458192 sen0001.xml
林 lin2 5823965393 sen0001.xml
龙 long2 6912073037 sen0001.xml
门 men2 7308177216 sen0001.xml
石 shi2 7729181843 sen0001.xml
窟 ku1 8201187964 sen0001.xml
泰 tai4 9256197200 sen0001.xml
山 shan1 97313102455 sen0001.xml
摩 mo2 102508106749 sen0001.xml
崖 ya2 106820110016 sen0001.xml
石 shi2 110159114153 sen0001.xml
刻 ke4 114416119772 sen0001.xml
DECLARE @idoc int
DECLARE @doc varchar(8000)--从文件中读出XML内容到临时表
create table #tb (doc varchar(8000))
bulk insert #tb from 'F:\corpus samples\sen0001.xml'--将读取到的数据保存到变量中
set @doc=''
select @doc=@doc+doc from #tb--删除临时表
drop table #tb--获取数据
EXEC sp_xml_preparedocument @idoc OUTPUT, @doc--创建测试的表
create table ta
(ch_char char(4),pinyin char(10),start_sample int,end_sample int)insert into ta(ch_char,pinyin,end_sample)
SELECT * FROM OPENXML (@idoc, '/utterance/sentence/prosodic_phrase/prosodic_word/syllable',2)
WITH (ch_char char(4) '@ch_char',
pinyin char(10) '@pinyin',
end_sample int '@end_sample')insert into ta(start_sample)
SELECT * FROM OPENXML (@idoc, '/utterance/sil',2)
WITH (start_sample int '@end_sample')insert into ta(start_sample)
SELECT * FROM OPENXML (@idoc, '/utterance/sentence/prosodic_phrase/prosodic_word/sil',2)
WITH (start_sample int '@end_sample')
EXEC sp_xml_removedocument @idoc--建立临时表
select * , id=identity(int) into #tt from ta
--得到表中记录数的一半
declare @num int
select @num=count(*) from #tt
set @num=(@num+1)/2
--查询结果
--insert into syllables(ch_char,pinyin,start_sample,end_sample)
select ch_char as word, pinyin
,start_sample=isnull(start_sample, (select start_sample from #tt as tt where tt.id=#tt.id+@num))
,end_sample
, 'sen0001.xml' as file_name
into syllables
from #tt
where id<=@num--显示处理结果
select * from syllables--清除
drop table #tt
drop table ta
drop table syllables
DECLARE @idoc int
DECLARE @doc varchar(8000)--从文件中读出XML内容到临时表
create table #tb (doc varchar(8000))
bulk insert #tb from 'F:\corpus samples\sen0002.xml'--将读取到的数据保存到变量中
set @doc=''
select @doc=@doc+doc from #tb--删除临时表
drop table #tb--获取数据
EXEC sp_xml_preparedocument @idoc OUTPUT, @doc--创建测试的表
create table ta
(ch_char char(4),pinyin char(10),start_sample int,end_sample int)insert into ta(ch_char,pinyin,end_sample)
SELECT * FROM OPENXML (@idoc, '/utterance/sentence/prosodic_phrase/prosodic_word/syllable',2)
WITH (ch_char char(4) '@ch_char',
pinyin char(10) '@pinyin',
end_sample int '@end_sample')insert into ta(start_sample)
SELECT * FROM OPENXML (@idoc, '/utterance/sil',2)
WITH (start_sample int '@end_sample')insert into ta(start_sample)
SELECT * FROM OPENXML (@idoc, '/utterance/sentence/prosodic_phrase/prosodic_word/sil',2)
WITH (start_sample int '@end_sample')
EXEC sp_xml_removedocument @idoc
--建立临时表
select * , id=identity(int) into #tt from ta
--得到表中记录数的一半
declare @num int
select @num=count(*) from #tt
set @num=(@num+1)/2--查询结果
insert into syllables(ch_char,pinyin,start_sample,end_sample,file_name)
select ch_char, pinyin,start_sample=isnull(start_sample, (select start_sample from #tt as tt where tt.id=#tt.id+@num)),end_sample,
'sen0002.wav' as file_name
from #tt
where id<=@num--清除
drop table #tt
drop table ta
--显示处理结果
select * from syllables运行后
添加到syllables表中的数据为:
ch_char pinyin start_sample end_sample file_name
为 wei4 8731 14212 sen0001.wav
临 lin2 14318 18632 sen0001.wav
帖 tie4 18703 25740 sen0001.wav
他 ta1 30319 33343 sen0001.wav
还 hai2 33414 37391 sen0001.wav
远 yuan3 37462 41617 sen0001.wav
游 you2 41741 47168 sen0001.wav
西 xi1 47232 50740 sen0001.wav
安 an1 50775 54736 sen0001.wav
碑 bei1 55184 58192 sen0001.wav
林 lin2 58239 65393 sen0001.wav
龙 long2 69120 73037 sen0001.wav
门 men2 73081 77216 sen0001.wav
石 shi2 77291 81843 sen0001.wav
窟 ku1 82011 87964 sen0001.wav
泰 tai4 92561 97200 sen0001.wav
山 shan1 97313 102455 sen0001.wav
摩 mo2 102508 106749 sen0001.wav
崖 ya2 106820 110016 sen0001.wav
石 shi2 110159 114153 sen0001.wav
刻 ke4 114416 119772 sen0001.wav 有大量的xml文件,能不能循环导入啊?单个处理太麻烦了
bulk insert #tb from 'F:\corpus samples\sen0002.xml'这一句是不是可以通过循环处理?那么'sen0002.wav' as file_name也要相应地改一下吧?
这个存储过程的形式参数为文件名
(注意表名和列名是不能用变量的,
这种情况,你用动态sql语句来实现就可以)
--示例
use northwind
declare @str varchar(40)
set @str='abc'
select @str as str_sample,*
from orders
例如:
程序中的ta表是实表, 如果改用临时表的话, 应该会快一些。
下面这部分可以做一下优化(例如:后面两个insert into ta可以用一个insert ... select...union all ...select来做; 如果可以将三个insert into合并在一起做的话,那就可以避免后面的程序的二次加工)insert into ta(ch_char,pinyin,end_sample)
SELECT * FROM OPENXML (@idoc, '/utterance/sentence/prosodic_phrase/prosodic_word/syllable',2)
WITH (ch_char char(4) '@ch_char',
pinyin char(10) '@pinyin',
end_sample int '@end_sample')insert into ta(start_sample)
SELECT * FROM OPENXML (@idoc, '/utterance/sil',2)
WITH (start_sample int '@end_sample')insert into ta(start_sample)
SELECT * FROM OPENXML (@idoc, '/utterance/sentence/prosodic_phrase/prosodic_word/sil',2)
WITH (start_sample int '@end_sample')
EXEC sp_xml_removedocument @idoc
呵呵, 说了好多空洞的话。
希望我已经表达清楚了。