-- Table "refgene" DDLCREATE TABLE `refgene` (
`bin` smallint(5) unsigned NOT NULL DEFAULT '0',
`name` varchar(255) NOT NULL DEFAULT '',
`chrom` varchar(255) NOT NULL DEFAULT '',
`strand` char(1) NOT NULL DEFAULT '',
`txStart` int(11) unsigned NOT NULL DEFAULT '0',
`txEnd` int(11) unsigned NOT NULL DEFAULT '0',
`cdsStart` int(11) unsigned NOT NULL DEFAULT '0',
`cdsEnd` int(11) unsigned NOT NULL DEFAULT '0',
`exonCount` int(10) unsigned NOT NULL DEFAULT '0',
`exonStarts` longblob NOT NULL,
`exonEnds` longblob NOT NULL,
`id` int(10) unsigned NOT NULL DEFAULT '0',
`name2` varchar(255) NOT NULL DEFAULT '',
`cdsStartStat` enum('none','unk','incmpl','cmpl') NOT NULL DEFAULT 'none',
`cdsEndStat` enum('none','unk','incmpl','cmpl') NOT NULL DEFAULT 'none',
`exonFrames` longblob NOT NULL,
KEY `chrom` (`chrom`(7),`bin`),
KEY `name2` (`name2`(10)),
KEY `name` (`name`(12)),
KEY `chrom_2` (`chrom`(7),`txStart`),
KEY `chrom_3` (`chrom`(7),`txEnd`),
KEY `ix_all` (`chrom`,`strand`,`txStart`,`txEnd`)
) ENGINE=MyISAM DEFAULT CHARSET=latin1;
-- Table "g20100806_wjh_h_" DDLCREATE TABLE `g20100806_wjh_h_` (
`chromosome` varchar(255) DEFAULT NULL,
`Chip` varchar(255) DEFAULT NULL,
`start` varchar(255) DEFAULT NULL,
`stop` varchar(255) DEFAULT NULL,
`probeids` varchar(255) DEFAULT NULL,
`seq_id` varchar(255) DEFAULT NULL,
`count` varchar(255) DEFAULT NULL,
`ratioH_1` varchar(255) DEFAULT NULL,
`ratioH_2` varchar(255) DEFAULT NULL,
`ratioH_3` varchar(255) DEFAULT NULL,
KEY `ix_all` (`chromosome`,`start`,`stop`),
KEY `ix_chr` (`chromosome`),
KEY `ix_start` (`start`),
KEY `ix_stop` (`stop`)
) ENGINE=MyISAM DEFAULT CHARSET=latin1;
我想通过这两张表得到第三张表。如下:
//时间和空间复杂度都比较高,不予考虑
delimiter //
create procedure p_create_temp3(length int)
begin
create table temp3
select chrom,start,stop,name,name2,txstart as tss,strand,seq_id,probeids,txstart,txend,cdsstart,cdsend from refgene,g20100806_wjh_h_
where refGene.chrom=g20100806_wjh_h_.chromosome and ((strand='+' and ((start<=txstart-length and stop>=txstart-length) or (start>=txstart-length and stop<=txstart) or (start>=txstart-length and stop>=txstart)))
or
(strand='-' and ((start<=txend and stop>=txend) or (start>=txend and stop<=txend+length) or (start>=txend and stop>=txend+length))));
end
delimiter ;
call p_create_temp3(20);
请问:如何提高其效率,表g20100806_wjh_h_的索引用不上。又或者有没有什么其他的方法可以代替呢?谢谢
show index from g20100806_wjh_h_;
explain select chrom,start,stop,name,name2,txstart as tss,strand,seq_id,probeids,txstart,txend,cdsstart,cdsend
from refgene,g20100806_wjh_h_
where refGene.chrom=g20100806_wjh_h_.chromosome
and (
(strand='+'
and ((start<=txstart-length and stop>=txstart-length) or (start>=txstart-length and stop<=txstart) or (start>=txstart-length and stop>=txstart))
)
or
(strand='-' and ((start<=txend and stop>=txend) or (start>=txend and stop<=txend+length) or (start>=txend and stop>=txend+length)))
);
贴出三句的结果以供分析。
oot@localhost : tss Sun Sep 26 12:18:55 2010>show index from refgene;
+---------+------------+----------+--------------+-------------+-----------+-------------+----------+--------+------+------------+---------+
| Table | Non_unique | Key_name | Seq_in_index | Column_name | Collation | Cardinality | Sub_part | Packed | Null | Index_type | Comment |
+---------+------------+----------+--------------+-------------+-----------+-------------+----------+--------+------+------------+---------+
| refgene | 1 | chrom | 1 | chrom | A | 46 | 7 | NULL | | BTREE | |
| refgene | 1 | chrom | 2 | bin | A | 12013 | NULL | NULL | | BTREE | |
| refgene | 1 | name2 | 1 | name2 | A | 18020 | 10 | NULL | | BTREE | |
| refgene | 1 | name | 1 | name | A | 36040 | 12 | NULL | | BTREE | |
| refgene | 1 | chrom_2 | 1 | chrom | A | 46 | 7 | NULL | | BTREE | |
| refgene | 1 | chrom_2 | 2 | txStart | A | 36040 | NULL | NULL | | BTREE | |
| refgene | 1 | chrom_3 | 1 | chrom | A | 46 | 7 | NULL | | BTREE | |
| refgene | 1 | chrom_3 | 2 | txEnd | A | 36040 | NULL | NULL | | BTREE | |
| refgene | 1 | ix_all | 1 | chrom | A | 46 | NULL | NULL | | BTREE | |
| refgene | 1 | ix_all | 2 | strand | A | 83 | NULL | NULL | | BTREE | |
| refgene | 1 | ix_all | 3 | txStart | A | 36040 | NULL | NULL | | BTREE | |
| refgene | 1 | ix_all | 4 | txEnd | A | 36040 | NULL | NULL | | BTREE | |
+---------+------------+----------+--------------+-------------+-----------+-------------+----------+--------+------+------------+---------+
12 rows in set (0.05 sec)oot@localhost : tss Sun Sep 26 12:20:01 2010>show index from g20100806_wjh_h_;
+------------------+------------+----------+--------------+-------------+-----------+-------------+----------+--------+------+------------+---------+
| Table | Non_unique | Key_name | Seq_in_index | Column_name | Collation | Cardinality | Sub_part | Packed | Null | Index_type | Comment |
+------------------+------------+----------+--------------+-------------+-----------+-------------+----------+--------+------+------------+---------+
| g20100806_wjh_h_ | 1 | ix_all | 1 | chromosome | A | 42 | NULL | NULL | YES | BTREE | |
| g20100806_wjh_h_ | 1 | ix_all | 2 | start | A | 385019 | NULL | NULL | YES | BTREE | |
| g20100806_wjh_h_ | 1 | ix_all | 3 | stop | A | 385019 | NULL | NULL | YES | BTREE | |
| g20100806_wjh_h_ | 1 | ix_chr | 1 | chromosome | A | 42 | NULL | NULL | YES | BTREE | |
| g20100806_wjh_h_ | 1 | ix_start | 1 | start | A | 385019 | NULL | NULL | YES | BTREE | |
| g20100806_wjh_h_ | 1 | ix_stop | 1 | stop | A | 385019 | NULL | NULL | YES | BTREE | |
+------------------+------------+----------+--------------+-------------+-----------+-------------+----------+--------+------+------------+---------+
6 rows in set (0.00 sec)
oot@localhost : tss Sun Sep 26 09:38:19 2010>explain select chrom,start,stop,name,name2,txstart as tss,strand,seq_id,probeids,txstart,txend,cdsstart,cdsend from refgene,g20100806_wjh_h_
-> where refGene.chrom=g20100806_wjh_h_.chromosome and ((txstart>=start+20 and txstart<=stop+20) or (txstart<=start+20 and txstart>=stop) or (txstart<=start+20 and txstart<=stop)) and strand='+' \G;
*************************** 1. row ***************************
id: 1
select_type: SIMPLE
table: g20100806_wjh_h_
type: ALL
possible_keys: ix_all,ix_chr,ix_stop
key: NULL
key_len: NULL
ref: NULL
rows: 385019
Extra:
*************************** 2. row ***************************
id: 1
select_type: SIMPLE
table: refgene
type: ref
possible_keys: chrom,chrom_2,chrom_3,ix_all
key: ix_all
key_len: 258
ref: tss.g20100806_wjh_h_.chromosome,const
rows: 434
Extra: Using where
2 rows in set (0.00 sec)大概就是这样
create procedure p_create_temp3(length int)
begin
select chrom,start,stop,name,name2,txstart as tss,strand,seq_id,probeids,txstart,txend,cdsstart,cdsend from refgene,g20100806_wjh_h_
where refGene.chrom=g20100806_wjh_h_.chromosome and strand='+' and ((start<=txstart-length and stop>=txstart-length) or (start>=txstart-length and stop<=txstart) or (start>=txstart-length and stop>=txstart))
into outfile 'd:/temp3_positive.txt';
select chrom,start,stop,name,name2,txstart as tss,strand,seq_id,probeids,txstart,txend,cdsstart,cdsend from refgene,g20100806_wjh_h_
where refGene.chrom=g20100806_wjh_h_.chromosome and strand='-' and ((start<=txend and stop>=txend) or (start>=txend and stop<=txend+length) or (start>=txend and stop>=txend+length))
into outfile 'd:/temp3_negative.txt';
end
delimiter ;
call p_create_temp3(1200);逻辑只能简化到这了。
这样再将文件导入到表中。
可是就单个sql语句也很复杂,大表的索引用不上,速度超慢的
如果strand=‘-’,则start,stop也在令一个特定的区间,这个区间也和length有关
from refgene,g20100806_wjh_h_
where refGene.chrom=g20100806_wjh_h_.chromosome
and
(
(strand='+' and
(
(start<=txstart-length and stop>=txstart-length)
or
(start>=txstart-length and stop<=txstart)
or
(start>=txstart-length and stop>=txstart)
)
)
or
(strand='-' and
(
(start<=txend and stop>=txend)
or
(start>=txend and stop<=txend+length)
or
(start>=txend and stop>=txend+length)
)
)
);你的语句的逻辑很奇怪!
比如下面的。 (strand='+' and
(
(start<=txstart-length and stop>=txstart-length)
or
(start>=txstart-length and stop<=txstart)
or
(start>=txstart-length and stop>=txstart)
)
)这两句红色部分
(start>=txstart-length and stop<=txstart) or (start>=txstart-length and stop>=txstart)
就等同于start>=txstart-length and ( stop<=txstart or stop>=txstart)
也就是 start>=txstart-length and TRUE
建议你能用语言描述 一下实际的业务逻辑。
refGene(chrom,strand,txstart)
refGene(chrom,strand,txend)然后运行如下语句。
select chrom,start,stop,name,name2,txstart as tss,strand,seq_id,probeids,txstart,txend,cdsstart,cdsend
from refgene,g20100806_wjh_h_
where chrom=chromosome
and strand='+' and stop+length>=txstart
union all
select chrom,start,stop,name,name2,txstart as tss,strand,seq_id,probeids,txstart,txend,cdsstart,cdsend
from refgene,g20100806_wjh_h_
where chrom=chromosome
and strand='+' and start+length>=txstart
union all
select chrom,start,stop,name,name2,txstart as tss,strand,seq_id,probeids,txstart,txend,cdsstart,cdsend
from refgene,g20100806_wjh_h_
where chrom=chromosome
and strand='-' and stop>=txend
union all
select chrom,start,stop,name,name2,txstart as tss,strand,seq_id,probeids,txstart,txend,cdsstart,cdsend
from refgene,g20100806_wjh_h_
where chrom=chromosome
and strand='-' and start>=txend;
delimiter //
create procedure p_create_temp3(length int)
begin
select chrom,start,stop,name,name2,txstart as tss,strand,seq_id,probeids,txstart,txend,cdsstart,cdsend from refgene,g20100806_wjh_h_
where refGene.chrom=g20100806_wjh_h_.chromosome and strand='+' and ((start<=txstart-length and stop>=txstart-length) or (start>=txstart-length and stop<=txstart) or (start<=txstart and stop>=txstart))
into outfile 'd:/temp3_positive.txt';
select chrom,start,stop,name,name2,txstart as tss,strand,seq_id,probeids,txstart,txend,cdsstart,cdsend from refgene,g20100806_wjh_h_
where refGene.chrom=g20100806_wjh_h_.chromosome and strand='-' and ((start<=txend and stop>=txend) or (start>=txend and stop<=txend+length) or (start<=txend+length and stop>=txend+length))
into outfile 'd:/temp3_negative.txt';
create table t_3 select chrom,start,stop,name,name2,txstart as tss,strand,seq_id,probeids,txstart,txend,cdsstart,cdsend from refgene,g20100806_wjh_h_ limit 0;
load data infile 'd:/temp3_positive.txt' into table t_3 fields terminated by '\t' escaped by '\\' lines terminated by '\n';
load data infile 'd:/temp3_negative.txt' into table t_3 fields terminated by '\t' escaped by '\\' lines terminated by '\n'; end
delimiter ;
call p_create_temp3(1200);
这样写的话,索引应该怎样建比较好呢?
这样写是为了能用上联合索引吧,可是为什么用union all而不用union呢?
原来我的写法为什么用不上索引呢?我也建了联合索引啊
建议说明一下。原来我的写法为什么用不上索引呢?我也建了联合索引啊
refGene(chrom,strand,txstart)
refGene(chrom,strand,txend)
这两个索引你建了吗?这样写是为了能用上联合索引吧,可是为什么用union all而不用union呢?
如果担心重复则应该用UNION。
from refgene,g20100806_wjh_h_
where chrom=chromosome
and strand='+' and stop+length>=txstart
union all
select chrom,start,stop,name,name2,txstart as tss,strand,seq_id,probeids,txstart,txend,cdsstart,cdsend
from refgene,g20100806_wjh_h_
where chrom=chromosome
and strand='-' and stop>=txend
stop是永远大于start
原来的逻辑有点问题,我改正过来了麻烦你再重新看一下我的sql语句。
delimiter //
create procedure p_create_temp3(length int)
beginselect chrom,start,stop,name,name2,txstart as tss,strand,seq_id,probeids,txstart,txend,cdsstart,cdsend from refgene,g20100806_wjh_h_
where refGene.chrom=g20100806_wjh_h_.chromosome and strand='+' and ((start<=txstart-length and stop>=txstart-length) or (start>=txstart-length and stop<=txstart) or (start<=txstart and stop>=txstart))
into outfile 'd:/temp3_positive.txt';
select chrom,start,stop,name,name2,txstart as tss,strand,seq_id,probeids,txstart,txend,cdsstart,cdsend from refgene,g20100806_wjh_h_
where refGene.chrom=g20100806_wjh_h_.chromosome and strand='-' and ((start<=txend and stop>=txend) or (start>=txend and stop<=txend+length) or (start<=txend+length and stop>=txend+length))
into outfile 'd:/temp3_negative.txt';
create table t_3 select chrom,start,stop,name,name2,txstart as tss,strand,seq_id,probeids,txstart,txend,cdsstart,cdsend from refgene,g20100806_wjh_h_ limit 0;
load data infile 'd:/temp3_positive.txt' into table t_3 fields terminated by '\t' escaped by '\\' lines terminated by '\n';
load data infile 'd:/temp3_negative.txt' into table t_3 fields terminated by '\t' escaped by '\\' lines terminated by '\n'; end
delimiter ;
call p_create_temp3(1200);
这样写的话,索引应该怎样建比较好呢?
我觉得你的
select chrom,start,stop,name,name2,txstart as tss,strand,seq_id,probeids,txstart,txend,cdsstart,cdsend
from refgene,g20100806_wjh_h_
where chrom=chromosome
and strand='+' and stop+length>=txstart
union all
select chrom,start,stop,name,name2,txstart as tss,strand,seq_id,probeids,txstart,txend,cdsstart,cdsend
from refgene,g20100806_wjh_h_
where chrom=chromosome
and strand='-' and stop>=txend
应该和我的逻辑不一样吧??????
root@localhost : tss Sun Sep 26 16:33:17 2010>show index from refgene;
+---------+------------+----------+--------------+-------------+-----------+-------------+----------+--------+------+------------+---------+
| Table | Non_unique | Key_name | Seq_in_index | Column_name | Collation | Cardinality | Sub_part | Packed | Null | Index_type | Comment |
+---------+------------+----------+--------------+-------------+-----------+-------------+----------+--------+------+------------+---------+
| refgene | 1 | ix_start | 1 | chrom | A | 46 | NULL | NULL | | BTREE | |
| refgene | 1 | ix_start | 2 | strand | A | 83 | NULL | NULL | | BTREE | |
| refgene | 1 | ix_start | 3 | txStart | A | 36040 | NULL | NULL | | BTREE | |
| refgene | 1 | ix_end | 1 | chrom | A | 46 | NULL | NULL | | BTREE | |
| refgene | 1 | ix_end | 2 | strand | A | 83 | NULL | NULL | | BTREE | |
| refgene | 1 | ix_end | 3 | txEnd | A | 36040 | NULL | NULL | | BTREE | |
+---------+------------+----------+--------------+-------------+-----------+-------------+----------+--------+------+------------+---------+
6 rows in set (0.00 sec)
root@localhost : tss Sun Sep 26 15:45:03 2010>explain
-> select chrom,start,stop,name,name2,txstart as tss,strand,seq_id,probeids,txstart,txend,cdsstart,cdsend from refgene,g20100806_wjh_h_
-> where refGene.chrom=g20100806_wjh_h_.chromosome and strand='+' and start+20<=txstart and stop+20>=txstart
-> union
-> select chrom,start,stop,name,name2,txstart as tss,strand,seq_id,probeids,txstart,txend,cdsstart,cdsend from refgene,g20100806_wjh_h_
-> where refGene.chrom=g20100806_wjh_h_.chromosome and strand='+' and start+20>=txstart and stop<=txstart
-> union
-> select chrom,start,stop,name,name2,txstart as tss,strand,seq_id,probeids,txstart,txend,cdsstart,cdsend from refgene,g20100806_wjh_h_
-> where refGene.chrom=g20100806_wjh_h_.chromosome and strand='+' and start<=txstart and stop>=txstart \G;
*************************** 1. row ***************************
id: 1
select_type: PRIMARY
table: g20100806_wjh_h_
type: ALL
possible_keys: ix_all,ix_chr
key: NULL
key_len: NULL
ref: NULL
rows: 385019
Extra:
*************************** 2. row ***************************
id: 1
select_type: PRIMARY
table: refgene
type: ref
possible_keys: ix_start,ix_end
key: ix_start
key_len: 258
ref: tss.g20100806_wjh_h_.chromosome,const
rows: 434
Extra: Using where
*************************** 3. row ***************************
id: 2
select_type: UNION
table: g20100806_wjh_h_
type: ALL
possible_keys: ix_all,ix_chr,ix_stop
key: NULL
key_len: NULL
ref: NULL
rows: 385019
Extra:
*************************** 4. row ***************************
id: 2
select_type: UNION
table: refgene
type: ref
possible_keys: ix_start,ix_end
key: ix_start
key_len: 258
ref: tss.g20100806_wjh_h_.chromosome,const
rows: 434
Extra: Using where
*************************** 5. row ***************************
id: 3
select_type: UNION
table: g20100806_wjh_h_
type: ALL
possible_keys: ix_all,ix_chr,ix_start,ix_stop
key: NULL
key_len: NULL
ref: NULL
rows: 385019
Extra:
*************************** 6. row ***************************
id: 3
select_type: UNION
table: refgene
type: ref
possible_keys: ix_start,ix_end
key: ix_start
key_len: 258
ref: tss.g20100806_wjh_h_.chromosome,const
rows: 434
Extra: Using where
*************************** 7. row ***************************
id: NULL
select_type: UNION RESULT
table: <union1,2,3>
type: ALL
possible_keys: NULL
key: NULL
key_len: NULL
ref: NULL
rows: NULL
Extra:
7 rows in set (0.00 sec)我按照你的方法建了索引,调整了sql语句,结果如上,有一张表是全文扫描rows: 385019
搞不懂连接表是如何利用索引的??
where refGene.chrom=g20100806_wjh_h_.chromosome and strand='+' and start+20<=txstart and stop+20>=txstart where refGene.chrom=g20100806_wjh_h_.chromosome and strand='+' and start+20>=txstart and stop<=txstartwhere refGene.chrom=g20100806_wjh_h_.chromosome and strand='+' and start<=txstart and stop>=txstart
就这三个条件,将满足这三个条件的结果集合并(union)这是对于strand为‘+’的情况
其实说白了,就是chrom相等,strand为‘+’,start与stop表示一个区间,这个区间只要和[txstart-length,txend]有相交的地方,就认为是我所需要的结果对于strand为‘-’情况也一样。不同的是[start,stop]与区间[txend,txend+length]相交的地方就是我想要的。
您看我这样描述你清楚了么?
前面的很多where,or,union都是用来判断是否相交用的。
楼主能认真对待一下你自己进出的需求吗?! 虽然从CSDN可以得到免费的帮助,但也不是用来浪费的啊。
两张表中的关联字段:
refGene.chrom=g20100806_wjh_h_.chromosome 这是连接的条件
strand='+/-'有两种情况
为‘+’时,将g20100806_wjh_h_表中的每一条记录去和refgene比较,拿如果如果【start,stop】与区间【txstart-length,txend】相交,则该条记录就满足需求,并将其插入到我生成的新表中。
为'-',将g20100806_wjh_h_表中的每一条记录去和refgene比较,如果【start,stop】与区间【txend,txend+length】相交,则该记录就是我们需要的。
(其中start永远小于stop,txstart永远小于txend)
不知道这样你是否清楚?
chrom相等,strand='-',如果【start,stop】与区间【txend,txend+length】相交,则该记录就是我们需要的。请确认红字部分没有错。
如果错了,就结贴重开再问吧。